/*        $NetBSD: trap.c,v 1.121 2019/07/13 17:03:01 mlelstv Exp $        */
      
      /*
       * Copyright (c) 1998, 2000, 2017 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Charles M. Hannum, and by Maxime Villard.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Copyright (c) 1990 The Regents of the University of California.
       * All rights reserved.
       *
       * This code is derived from software contributed to Berkeley by
       * the University of Utah, and William Jolitz.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)trap.c        7.4 (Berkeley) 5/13/91
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.121 2019/07/13 17:03:01 mlelstv Exp $");
      
      #include "opt_ddb.h"
      #include "opt_kgdb.h"
      #include "opt_xen.h"
      #include "opt_dtrace.h"
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/proc.h>
      #include <sys/acct.h>
      #include <sys/kauth.h>
      #include <sys/kernel.h>
      #include <sys/kmem.h>
      #include <sys/ras.h>
      #include <sys/signal.h>
      #include <sys/syscall.h>
      #include <sys/cpu.h>
      #include <sys/ucontext.h>
      
      #include <uvm/uvm_extern.h>
      
      #ifdef COMPAT_NETBSD32
      #include <sys/exec.h>
      #include <compat/netbsd32/netbsd32_exec.h>
      #endif
      
      #include <machine/cpufunc.h>
      #include <x86/fpu.h>
      #include <x86/dbregs.h>
      #include <machine/psl.h>
      #include <machine/reg.h>
      #include <machine/trap.h>
      #include <machine/userret.h>
      #include <machine/db_machdep.h>
      
      #include <x86/nmi.h>
      
      #ifndef XENPV
      #include "isa.h"
      #endif
      
      #include <sys/kgdb.h>
      
      #ifdef KDTRACE_HOOKS
      #include <sys/dtrace_bsd.h>
      
      /*
       * This is a hook which is initialized by the dtrace module
       * to handle traps which might occur during DTrace probe
       * execution.
       */
      dtrace_trap_func_t        dtrace_trap_func = NULL;
      
      dtrace_doubletrap_func_t        dtrace_doubletrap_func = NULL;
      #endif
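
       /*
        * Illustrative sketch, not part of the original source: a DTrace provider
        * module is expected to install these hooks when it attaches and clear
        * them again on detach, roughly as below.  The handler name and the exact
        * prototype behind dtrace_trap_func_t are assumptions here; see the call
        * site in trap() for how the hook is invoked.
        *
        *      dtrace_trap_func = mymodule_trap;       hypothetical handler
        *      ...
        *      dtrace_trap_func = NULL;                on detach
        *
        * The handler is passed the trap frame and the trap type, and returns
        * non-zero if it recovered from the fault so trap() can simply return.
        */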
      
      void nmitrap(struct trapframe *);
      void doubletrap(struct trapframe *);
      void trap(struct trapframe *);
      void trap_return_fault_return(struct trapframe *) __dead;
      
      const char * const trap_type[] = {
              "privileged instruction fault",                /*  0 T_PRIVINFLT */
              "breakpoint trap",                        /*  1 T_BPTFLT */
              "arithmetic trap",                        /*  2 T_ARITHTRAP */
              "asynchronous system trap",                /*  3 T_ASTFLT */
              "protection fault",                        /*  4 T_PROTFLT */
              "trace trap",                                /*  5 T_TRCTRAP */
              "page fault",                                /*  6 T_PAGEFLT */
              "alignment fault",                        /*  7 T_ALIGNFLT */
              "integer divide fault",                        /*  8 T_DIVIDE */
              "non-maskable interrupt",                /*  9 T_NMI */
              "overflow trap",                        /* 10 T_OFLOW */
              "bounds check fault",                        /* 11 T_BOUND */
              "FPU not available fault",                /* 12 T_DNA */
              "double fault",                                /* 13 T_DOUBLEFLT */
              "FPU operand fetch fault",                /* 14 T_FPOPFLT */
              "invalid TSS fault",                        /* 15 T_TSSFLT */
              "segment not present fault",                /* 16 T_SEGNPFLT */
              "stack fault",                                /* 17 T_STKFLT */
              "machine check fault",                        /* 18 T_MCA */
              "SSE FP exception",                        /* 19 T_XMM */
              "reserved trap",                        /* 20 T_RESERVED */
      };
      int        trap_types = __arraycount(trap_type);
      
      #define        IDTVEC(name)        __CONCAT(X, name)
      
      #ifdef TRAP_SIGDEBUG
      static void sigdebug(const struct trapframe *, const ksiginfo_t *, int);
      #define SIGDEBUG(a, b, c) sigdebug(a, b, c)
      #else
      #define SIGDEBUG(a, b, c)
      #endif
      
      static void
      onfault_restore(struct trapframe *frame, void *onfault, int error)
      {
               frame->tf_rip = (uintptr_t)onfault;
              frame->tf_rax = error;
      }
      
      static void *
      onfault_handler(const struct pcb *pcb, const struct trapframe *tf)
      {
              struct onfault_table {
                      uintptr_t start;
                      uintptr_t end;
                      void *handler;
              };
              extern const struct onfault_table onfault_table[];
              const struct onfault_table *p;
              uintptr_t pc;
      
               if (pcb->pcb_onfault != NULL) {
                       return pcb->pcb_onfault;
               }

               pc = tf->tf_rip;
               for (p = onfault_table; p->start; p++) {
                       if (p->start <= pc && pc < p->end) {
                               return p->handler;
                       }
               }
               return NULL;
      }
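
       /*
        * Illustrative sketch, not part of the original source: the onfault
        * protocol used by the copy functions.  Before touching user memory, a
        * routine such as copyin() records a recovery address in pcb_onfault;
        * if the access faults, trap() locates that address (or a match in
        * onfault_table) via onfault_handler() and uses onfault_restore() to
        * resume execution there with the error code in %rax:
        *
        *      pcb->pcb_onfault = handler;     arm before the user access
        *      ... touch user memory, may fault ...
        *      pcb->pcb_onfault = NULL;        disarm on success
        *
        * The real copy routines are written in assembly; the lines above are
        * only a C-flavoured outline of the idea.
        */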
      
      static void
      trap_print(const struct trapframe *frame, const lwp_t *l)
      {
              const int type = frame->tf_trapno;
      
              if (frame->tf_trapno < trap_types) {
                      printf("fatal %s", trap_type[type]);
              } else {
                      printf("unknown trap %d", type);
              }
              printf(" in %s mode\n", (type & T_USER) ? "user" : "supervisor");
      
              printf("trap type %d code %#lx rip %#lx cs %#lx rflags %#lx cr2 %#lx "
                  "ilevel %#x rsp %#lx\n",
                  type, frame->tf_err, (u_long)frame->tf_rip, frame->tf_cs,
                  frame->tf_rflags, rcr2(), curcpu()->ci_ilevel, frame->tf_rsp);
      
              printf("curlwp %p pid %d.%d lowest kstack %p\n",
                  l, l->l_proc->p_pid, l->l_lid, KSTACK_LOWEST_ADDR(l));
      }
      
      void
      nmitrap(struct trapframe *frame)
      {
              const int type = T_NMI;
      
              if (nmi_dispatch(frame))
                      return;
              /* NMI can be hooked up to a pushbutton for debugging */
              if (kgdb_trap(type, frame))
                      return;
              if (kdb_trap(type, 0, frame))
                      return;
              /* machine/parity/power fail/"kitchen sink" faults */
      
              x86_nmi();
      }
      
      void
      doubletrap(struct trapframe *frame)
      {
              const int type = T_DOUBLEFLT;
              struct lwp *l = curlwp;
      
              trap_print(frame, l);
      
              if (kdb_trap(type, 0, frame))
                      return;
              if (kgdb_trap(type, frame))
                      return;
      
              panic("double fault");
      }
      
      /*
       * trap(frame): exception, fault, and trap interface to BSD kernel.
       *
       * This common code is called from assembly language IDT gate entry routines
       * that prepare a suitable stack frame, and restore this frame after the
       * exception has been processed. Note that the effect is as if the arguments
       * were passed call by reference.
       *
        * Note that the fpu traps (vectors 0x07 T_DNA, 0x10 T_ARITHTRAP and
        * 0x13 T_XMM) jump directly into the code in x86/fpu.c, so they are
        * processed without interrupts being enabled.
       */
      void
      trap(struct trapframe *frame)
      {
               struct lwp *l = curlwp;
              struct proc *p;
              struct pcb *pcb;
              extern char kcopy_fault[];
              extern char IDTVEC(osyscall)[];
              extern char IDTVEC(syscall32)[];
              ksiginfo_t ksi;
              void *onfault;
              int type, error;
              uint64_t cr2;
              bool pfail;
      
              if (__predict_true(l != NULL)) {
                       pcb = lwp_getpcb(l);
                      p = l->l_proc;
              } else {
                      /*
                        * This can happen, e.g., on breakpoints early during boot.
                       */
                      pcb = NULL;
                      p = NULL;
              }
               type = frame->tf_trapno;
      
              if (!KERNELMODE(frame->tf_cs)) {
                      type |= T_USER;
                      l->l_md.md_regs = frame;
                      LWP_CACHE_CREDS(l, p);
              }
      
      #ifdef KDTRACE_HOOKS
              /*
               * A trap can occur while DTrace executes a probe. Before
               * executing the probe, DTrace blocks re-scheduling and sets
               * a flag in its per-cpu flags to indicate that it doesn't
               * want to fault. On returning from the probe, the no-fault
               * flag is cleared and finally re-scheduling is enabled.
               *
               * If the DTrace kernel module has registered a trap handler,
               * call it and if it returns non-zero, assume that it has
               * handled the trap and modified the trap frame so that this
               * function can return normally.
               */
               if ((type == T_PROTFLT || type == T_PAGEFLT) &&
                   dtrace_trap_func != NULL) {
                       if ((*dtrace_trap_func)(frame, type)) {
                              return;
                      }
              }
      #endif
      
              switch (type) {
      
              default:
              we_re_toast:
                      trap_print(frame, l);
      
                      if (kdb_trap(type, 0, frame))
                              return;
                      if (kgdb_trap(type, frame))
                              return;
                      /*
                       * If this is a breakpoint, don't panic if we're not connected.
                       */
                      if (type == T_BPTFLT && kgdb_disconnected()) {
                              printf("kgdb: ignored %s\n", trap_type[type]);
                              return;
                      }
                      panic("trap");
                      /*NOTREACHED*/
      
              case T_PROTFLT:
              case T_SEGNPFLT:
              case T_ALIGNFLT:
              case T_STKFLT:
              case T_TSSFLT:
                      if (p == NULL)
                              goto we_re_toast;
      
                      /* Check for copyin/copyout fault. */
                      onfault = onfault_handler(pcb, frame);
                      if (onfault != NULL) {
                              onfault_restore(frame, onfault, EFAULT);
                              return;
                      }
      
                      goto we_re_toast;
      
              case T_PROTFLT|T_USER:                /* protection fault */
      #if defined(COMPAT_NETBSD32) && defined(COMPAT_10)
      
      /*
        * XXX This code is currently not included in the loadable module;
        * it is only included when the support is built into the kernel.
       */
              {
                      static const char lcall[7] = { 0x9a, 0, 0, 0, 0, 7, 0 };
                      const size_t sz = sizeof(lcall);
                      char tmp[sz];
      
                      /* Check for the oosyscall lcall instruction. */
                      if (p->p_emul == &emul_netbsd32 &&
                          frame->tf_rip < VM_MAXUSER_ADDRESS32 - sz &&
                          copyin((void *)frame->tf_rip, tmp, sz) == 0 &&
                          memcmp(tmp, lcall, sz) == 0) {
      
                              /* Advance past the lcall. */
                              frame->tf_rip += sz;
      
                              /* Do the syscall. */
                              p->p_md.md_syscall(frame);
                              goto out;
                      }
              }
      #endif
                      /* FALLTHROUGH */
              case T_TSSFLT|T_USER:
              case T_SEGNPFLT|T_USER:
              case T_STKFLT|T_USER:
              case T_ALIGNFLT|T_USER:
                      KSI_INIT_TRAP(&ksi);
                      ksi.ksi_trap = type & ~T_USER;
                      ksi.ksi_addr = (void *)frame->tf_rip;
                      switch (type) {
                      case T_SEGNPFLT|T_USER:
                      case T_STKFLT|T_USER:
                              ksi.ksi_signo = SIGBUS;
                              ksi.ksi_code = BUS_ADRERR;
                              break;
                      case T_TSSFLT|T_USER:
                              ksi.ksi_signo = SIGBUS;
                              ksi.ksi_code = BUS_OBJERR;
                              break;
                      case T_ALIGNFLT|T_USER:
                              ksi.ksi_signo = SIGBUS;
                              ksi.ksi_code = BUS_ADRALN;
                              break;
                      case T_PROTFLT|T_USER:
                              ksi.ksi_signo = SIGSEGV;
                              ksi.ksi_code = SEGV_ACCERR;
                              break;
                      default:
                              KASSERT(0);
                              break;
                      }
                      goto trapsignal;
      
              case T_PRIVINFLT|T_USER:        /* privileged instruction fault */
              case T_FPOPFLT|T_USER:                /* coprocessor operand fault */
                      KSI_INIT_TRAP(&ksi);
                      ksi.ksi_signo = SIGILL;
                      ksi.ksi_trap = type & ~T_USER;
                      ksi.ksi_addr = (void *) frame->tf_rip;
                      switch (type) {
                      case T_PRIVINFLT|T_USER:
                              ksi.ksi_code = ILL_PRVOPC;
                              break;
                      case T_FPOPFLT|T_USER:
                              ksi.ksi_code = ILL_COPROC;
                              break;
                      default:
                              KASSERT(0);
                              break;
                      }
                      goto trapsignal;
      
              case T_ASTFLT|T_USER:
                      /* Allow process switch. */
                      //curcpu()->ci_data.cpu_nast++;
                      if (l->l_pflag & LP_OWEUPC) {
                              l->l_pflag &= ~LP_OWEUPC;
                              ADDUPROF(l);
                      }
                      /* Allow a forced task switch. */
                      if (curcpu()->ci_want_resched) {
                              preempt();
                      }
                      goto out;
      
              case T_BOUND|T_USER:
              case T_OFLOW|T_USER:
              case T_DIVIDE|T_USER:
                      KSI_INIT_TRAP(&ksi);
                      ksi.ksi_signo = SIGFPE;
                      ksi.ksi_trap = type & ~T_USER;
                      ksi.ksi_addr = (void *)frame->tf_rip;
                      switch (type) {
                      case T_BOUND|T_USER:
                              ksi.ksi_code = FPE_FLTSUB;
                              break;
                      case T_OFLOW|T_USER:
                              ksi.ksi_code = FPE_INTOVF;
                              break;
                      case T_DIVIDE|T_USER:
                              ksi.ksi_code = FPE_INTDIV;
                              break;
                      default:
      #ifdef DIAGNOSTIC
                              panic("unhandled type %x\n", type);
      #endif
                              break;
                      }
                      goto trapsignal;
      
              case T_PAGEFLT:
                      /* Allow page faults in kernel mode. */
                       if (__predict_false(l == NULL))
                              goto we_re_toast;
      
                      onfault = pcb->pcb_onfault;
      
                       if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
                              goto we_re_toast;
                      }
      
                       cr2 = rcr2();
      
                      if (frame->tf_err & PGEX_X) {
                              /* SMEP might have brought us here */
                              if (cr2 < VM_MAXUSER_ADDRESS) {
                                      printf("prevented execution of %p (SMEP)\n",
                                          (void *)cr2);
                                      goto we_re_toast;
                              }
                      }
      
                       if ((frame->tf_err & PGEX_P) &&
                          cr2 < VM_MAXUSER_ADDRESS) {
                              /* SMAP might have brought us here */
                              if (onfault_handler(pcb, frame) == NULL) {
                                      printf("prevented access to %p (SMAP)\n",
                                          (void *)cr2);
                                      goto we_re_toast;
                              }
                      }
      
                      goto faultcommon;
      
              case T_PAGEFLT|T_USER: {
                      register vaddr_t va;
                      register struct vmspace *vm;
                      register struct vm_map *map;
                      vm_prot_t ftype;
                      extern struct vm_map *kernel_map;
      
                      cr2 = rcr2();
                      if (p->p_emul->e_usertrap != NULL &&
                          (*p->p_emul->e_usertrap)(l, cr2, frame) != 0)
                              return;
       faultcommon:
                      vm = p->p_vmspace;
                      if (__predict_false(vm == NULL)) {
                              goto we_re_toast;
                      }
                       pcb->pcb_cr2 = cr2;
                      va = trunc_page((vaddr_t)cr2);
                      /*
                       * It is only a kernel address space fault iff:
                       *        1. (type & T_USER) == 0  and
                       *        2. pcb_onfault not set or
                       *        3. pcb_onfault set but supervisor space fault
                       * The last can occur during an exec() copyin where the
                       * argument space is lazy-allocated.
                       */
                      if (type == T_PAGEFLT && va >= VM_MIN_KERNEL_ADDRESS)
                              map = kernel_map;
                      else
                               map = &vm->vm_map;
                      if (frame->tf_err & PGEX_W)
                              ftype = VM_PROT_WRITE;
                       else if (frame->tf_err & PGEX_X)
                              ftype = VM_PROT_EXECUTE;
                      else
                              ftype = VM_PROT_READ;
      
      #ifdef DIAGNOSTIC
                       if (map == kernel_map && va == 0) {
                              printf("trap: bad kernel access at %lx\n", va);
                              goto we_re_toast;
                      }
      #endif
                      /* Fault the original page in. */
                       onfault = pcb->pcb_onfault;
                      pcb->pcb_onfault = NULL;
                      error = uvm_fault(map, va, ftype);
                      pcb->pcb_onfault = onfault;
                      if (error == 0) {
                               if (map != kernel_map && (void *)va >= vm->vm_maxsaddr)
                                      uvm_grow(p, va);
      
                              pfail = false;
                               while (type == T_PAGEFLT) {
                                       /*
                                        * We need to switch the pmap now if we're
                                        * in the middle of copyin/out.
                                        *
                                        * But we don't need to do so for kcopy, as
                                        * it never touches userspace.
                                        */
                                       kpreempt_disable();
                                      if (curcpu()->ci_want_pmapload) {
                                              onfault = onfault_handler(pcb, frame);
                                              if (onfault != kcopy_fault) {
                                                      pmap_load();
                                              }
                                      }
                                      /*
                                       * We need to keep the pmap loaded and
                                       * so avoid being preempted until back
                                       * into the copy functions.  Disable
                                       * interrupts at the hardware level before
                                       * re-enabling preemption.  Interrupts
                                       * will be re-enabled by 'iret' when
                                       * returning back out of the trap stub.
                                       * They'll only be re-enabled when the
                                       * program counter is once again in
                                       * the copy functions, and so visible
                                       * to cpu_kpreempt_exit().
                                       */
      #ifndef XENPV
                                       x86_disable_intr();
      #endif
                                      l->l_nopreempt--;
                                       if (l->l_nopreempt > 0 || !l->l_dopreempt ||
                                          pfail) {
                                              return;
                                      }
      #ifndef XENPV
                                      x86_enable_intr();
      #endif
                                      /*
                                       * If preemption fails for some reason,
                                       * don't retry it.  The conditions won't
                                       * change under our nose.
                                       */
                                      pfail = kpreempt(0);
                              }
                              goto out;
                      }
      
                       if (type == T_PAGEFLT) {
                               onfault = onfault_handler(pcb, frame);
                               if (onfault != NULL) {
                                       onfault_restore(frame, onfault, error);
                                      return;
                              }
      
                              printf("uvm_fault(%p, 0x%lx, %d) -> %x\n",
                                  map, va, ftype, error);
                              goto we_re_toast;
                      }
      
                      KSI_INIT_TRAP(&ksi);
                      ksi.ksi_trap = type & ~T_USER;
                      ksi.ksi_addr = (void *)cr2;
                      switch (error) {
                      case EINVAL:
                              ksi.ksi_signo = SIGBUS;
                              ksi.ksi_code = BUS_ADRERR;
                              break;
                      case EACCES:
                              ksi.ksi_signo = SIGSEGV;
                              ksi.ksi_code = SEGV_ACCERR;
                              error = EFAULT;
                               break;
                      case ENOMEM:
                              ksi.ksi_signo = SIGKILL;
                              printf("UVM: pid %d.%d (%s), uid %d killed: "
                                  "out of swap\n", p->p_pid, l->l_lid, p->p_comm,
                                  l->l_cred ?  kauth_cred_geteuid(l->l_cred) : -1);
                              break;
                      default:
                              ksi.ksi_signo = SIGSEGV;
                              ksi.ksi_code = SEGV_MAPERR;
                              break;
                      }
      
                      SIGDEBUG(frame, &ksi, error);
                       (*p->p_emul->e_trapsignal)(l, &ksi);
                      break;
              }
      
              case T_TRCTRAP:
                      /*
                       * Ignore debug register trace traps due to
                       * accesses in the user's address space, which
                       * can happen under several conditions such as
                       * if a user sets a watchpoint on a buffer and
                       * then passes that buffer to a system call.
                       * We still want to get TRCTRAPS for addresses
                       * in kernel space because that is useful when
                       * debugging the kernel.
                       */
                      if (x86_dbregs_user_trap())
                              break;
      
                      /* Check whether they single-stepped into a lcall. */
                      if (frame->tf_rip == (uint64_t)IDTVEC(osyscall) ||
                          frame->tf_rip == (uint64_t)IDTVEC(syscall32)) {
                              frame->tf_rflags &= ~PSL_T;
                              return;
                      }
                      goto we_re_toast;
      
              case T_BPTFLT|T_USER:                /* bpt instruction fault */
              case T_TRCTRAP|T_USER:                /* trace trap */
                      /*
                       * Don't go single-stepping into a RAS.
                       */
                      if (p->p_raslist == NULL ||
                          (ras_lookup(p, (void *)frame->tf_rip) == (void *)-1)) {
                              KSI_INIT_TRAP(&ksi);
                              ksi.ksi_signo = SIGTRAP;
                              ksi.ksi_trap = type & ~T_USER;
                              if (x86_dbregs_user_trap()) {
                                      x86_dbregs_store_dr6(l);
                                      ksi.ksi_code = TRAP_DBREG;
                              } else if (type == (T_BPTFLT|T_USER))
                                      ksi.ksi_code = TRAP_BRKPT;
                              else
                                      ksi.ksi_code = TRAP_TRACE;
                              (*p->p_emul->e_trapsignal)(l, &ksi);
                      }
                      break;
              }
      
              if ((type & T_USER) == 0)
                      return;
      out:
              userret(l);
              return;
      trapsignal:
              SIGDEBUG(frame, &ksi, 0);
              (*p->p_emul->e_trapsignal)(l, &ksi);
              userret(l);
      }
      
      /*
       * startlwp: start of a new LWP.
       */
      void
      startlwp(void *arg)
      {
              ucontext_t *uc = arg;
              lwp_t *l = curlwp;
              int error __diagused;
      
              error = cpu_setmcontext(l, &uc->uc_mcontext, uc->uc_flags);
              KASSERT(error == 0);
      
              kmem_free(uc, sizeof(ucontext_t));
              userret(l);
      }
      
      #ifdef TRAP_SIGDEBUG
      static void
      frame_dump(const struct trapframe *tf, struct pcb *pcb)
      {
      
              printf("trapframe %p\n", tf);
              printf("rip %#018lx  rsp %#018lx  rfl %#018lx\n",
                  tf->tf_rip, tf->tf_rsp, tf->tf_rflags);
              printf("rdi %#018lx  rsi %#018lx  rdx %#018lx\n",
                  tf->tf_rdi, tf->tf_rsi, tf->tf_rdx);
              printf("rcx %#018lx  r8  %#018lx  r9  %#018lx\n",
                  tf->tf_rcx, tf->tf_r8, tf->tf_r9);
              printf("r10 %#018lx  r11 %#018lx  r12 %#018lx\n",
                  tf->tf_r10, tf->tf_r11, tf->tf_r12);
              printf("r13 %#018lx  r14 %#018lx  r15 %#018lx\n",
                  tf->tf_r13, tf->tf_r14, tf->tf_r15);
              printf("rbp %#018lx  rbx %#018lx  rax %#018lx\n",
                  tf->tf_rbp, tf->tf_rbx, tf->tf_rax);
              printf("cs %#04lx  ds %#04lx  es %#04lx  "
                  "fs %#04lx  gs %#04lx  ss %#04lx\n",
                  tf->tf_cs & 0xffff, tf->tf_ds & 0xffff, tf->tf_es & 0xffff,
                  tf->tf_fs & 0xffff, tf->tf_gs & 0xffff, tf->tf_ss & 0xffff);
              printf("fsbase %#018lx gsbase %#018lx\n", pcb->pcb_fs, pcb->pcb_gs);
              printf("\n");
              hexdump(printf, "Stack dump", tf, 256);
      }
      
      static void
      sigdebug(const struct trapframe *tf, const ksiginfo_t *ksi, int e)
      {
              struct lwp *l = curlwp;
              struct proc *p = l->l_proc;
      
              printf("pid %d.%d (%s): signal %d code=%d (trap %#lx) "
                  "@rip %#lx addr %#lx error=%d\n",
                  p->p_pid, l->l_lid, p->p_comm, ksi->ksi_signo, ksi->ksi_code,
                  tf->tf_trapno, tf->tf_rip, rcr2(), e);
              frame_dump(tf, lwp_getpcb(l));
      }
      #endif
      /*        $NetBSD: kern_cpu.c,v 1.75 2018/11/13 11:06:19 skrll Exp $        */
      
      /*-
       * Copyright (c) 2007, 2008, 2009, 2010, 2012 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*-
       * Copyright (c)2007 YAMAMOTO Takashi,
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.75 2018/11/13 11:06:19 skrll Exp $");
      
      #include "opt_cpu_ucode.h"
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/idle.h>
      #include <sys/sched.h>
      #include <sys/intr.h>
      #include <sys/conf.h>
      #include <sys/cpu.h>
      #include <sys/cpuio.h>
      #include <sys/proc.h>
      #include <sys/percpu.h>
      #include <sys/kernel.h>
      #include <sys/kauth.h>
      #include <sys/xcall.h>
      #include <sys/pool.h>
      #include <sys/kmem.h>
      #include <sys/select.h>
      #include <sys/namei.h>
      #include <sys/callout.h>
      #include <sys/pcu.h>
      
      #include <uvm/uvm_extern.h>
      
      #include "ioconf.h"
      
      /*
       * If the port has stated that cpu_data is the first thing in cpu_info,
       * verify that the claim is true. This will prevent them from getting out
       * of sync.
       */
      #ifdef __HAVE_CPU_DATA_FIRST
      CTASSERT(offsetof(struct cpu_info, ci_data) == 0);
      #else
      CTASSERT(offsetof(struct cpu_info, ci_data) != 0);
      #endif
      
      static void        cpu_xc_online(struct cpu_info *);
      static void        cpu_xc_offline(struct cpu_info *);
      
      dev_type_ioctl(cpuctl_ioctl);
      
      const struct cdevsw cpuctl_cdevsw = {
              .d_open = nullopen,
              .d_close = nullclose,
              .d_read = nullread,
              .d_write = nullwrite,
              .d_ioctl = cpuctl_ioctl,
              .d_stop = nullstop,
              .d_tty = notty,
              .d_poll = nopoll,
              .d_mmap = nommap,
              .d_kqfilter = nokqfilter,
              .d_discard = nodiscard,
              .d_flag = D_OTHER | D_MPSAFE
      };
      
      kmutex_t        cpu_lock                __cacheline_aligned;
      int                ncpu                        __read_mostly;
      int                ncpuonline                __read_mostly;
      bool                mp_online                __read_mostly;
      
      /* An array of CPUs.  There are ncpu entries. */
      struct cpu_info **cpu_infos                __read_mostly;
      
      /* Note: set on mi_cpu_attach() and idle_loop(). */
      kcpuset_t *        kcpuset_attached        __read_mostly        = NULL;
      kcpuset_t *        kcpuset_running                __read_mostly        = NULL;
      
      int (*compat_cpuctl_ioctl)(struct lwp *, u_long, void *) = (void *)enosys;
      
      static char cpu_model[128];
      
      /*
       * mi_cpu_init: early initialisation of MI CPU related structures.
       *
        * Note: may not block, and the memory allocator is not yet available.
       */
      void
      mi_cpu_init(void)
      {
      
              mutex_init(&cpu_lock, MUTEX_DEFAULT, IPL_NONE);
      
              kcpuset_create(&kcpuset_attached, true);
              kcpuset_create(&kcpuset_running, true);
              kcpuset_set(kcpuset_running, 0);
      }
      
      int
      mi_cpu_attach(struct cpu_info *ci)
      {
              int error;
      
              KASSERT(maxcpus > 0);
      
              ci->ci_index = ncpu;
              kcpuset_set(kcpuset_attached, cpu_index(ci));
      
              /*
               * Create a convenience cpuset of just ourselves.
               */
              kcpuset_create(&ci->ci_data.cpu_kcpuset, true);
              kcpuset_set(ci->ci_data.cpu_kcpuset, cpu_index(ci));
      
              TAILQ_INIT(&ci->ci_data.cpu_ld_locks);
              __cpu_simple_lock_init(&ci->ci_data.cpu_ld_lock);
      
               /* This is useful for, e.g., per-cpu evcnt. */
              snprintf(ci->ci_data.cpu_name, sizeof(ci->ci_data.cpu_name), "cpu%d",
                  cpu_index(ci));
      
              if (__predict_false(cpu_infos == NULL)) {
                      size_t ci_bufsize = (maxcpus + 1) * sizeof(struct cpu_info *);
                      cpu_infos = kmem_zalloc(ci_bufsize, KM_SLEEP);
              }
              cpu_infos[cpu_index(ci)] = ci;
      
              sched_cpuattach(ci);
      
              error = create_idle_lwp(ci);
              if (error != 0) {
                      /* XXX revert sched_cpuattach */
                      return error;
              }
      
              if (ci == curcpu())
                      ci->ci_data.cpu_onproc = curlwp;
              else
                      ci->ci_data.cpu_onproc = ci->ci_data.cpu_idlelwp;
      
              percpu_init_cpu(ci);
              softint_init(ci);
              callout_init_cpu(ci);
              xc_init_cpu(ci);
              pool_cache_cpu_init(ci);
              selsysinit(ci);
              cache_cpu_init(ci);
              TAILQ_INIT(&ci->ci_data.cpu_biodone);
              ncpu++;
              ncpuonline++;
      
              return 0;
      }
      
      void
      cpuctlattach(int dummy __unused)
      {
      
              KASSERT(cpu_infos != NULL);
      }
      
      int
      cpuctl_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l)
      {
              CPU_INFO_ITERATOR cii;
              cpustate_t *cs;
              struct cpu_info *ci;
              int error, i;
              u_int id;
      
              error = 0;
      
              mutex_enter(&cpu_lock);
              switch (cmd) {
              case IOC_CPU_SETSTATE:
                      cs = data;
                      error = kauth_authorize_system(l->l_cred,
                          KAUTH_SYSTEM_CPU, KAUTH_REQ_SYSTEM_CPU_SETSTATE, cs, NULL,
                          NULL);
                      if (error != 0)
                              break;
                      if (cs->cs_id >= maxcpus ||
                          (ci = cpu_lookup(cs->cs_id)) == NULL) {
                              error = ESRCH;
                              break;
                      }
                      cpu_setintr(ci, cs->cs_intr);
                      error = cpu_setstate(ci, cs->cs_online);
                      break;
      
              case IOC_CPU_GETSTATE:
                      cs = data;
                      id = cs->cs_id;
                      memset(cs, 0, sizeof(*cs));
                      cs->cs_id = id;
                      if (cs->cs_id >= maxcpus ||
                          (ci = cpu_lookup(id)) == NULL) {
                              error = ESRCH;
                              break;
                      }
                      if ((ci->ci_schedstate.spc_flags & SPCF_OFFLINE) != 0)
                              cs->cs_online = false;
                      else
                              cs->cs_online = true;
                      if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0)
                              cs->cs_intr = false;
                      else
                              cs->cs_intr = true;
                      cs->cs_lastmod = (int32_t)ci->ci_schedstate.spc_lastmod;
                      cs->cs_lastmodhi = (int32_t)
                          (ci->ci_schedstate.spc_lastmod >> 32);
                      cs->cs_intrcnt = cpu_intr_count(ci) + 1;
                      cs->cs_hwid = ci->ci_cpuid;
                      break;
      
              case IOC_CPU_MAPID:
                      i = 0;
                      for (CPU_INFO_FOREACH(cii, ci)) {
                              if (i++ == *(int *)data)
                                      break;
                      }
                      if (ci == NULL)
                              error = ESRCH;
                      else
                              *(int *)data = cpu_index(ci);
                      break;
      
              case IOC_CPU_GETCOUNT:
                      *(int *)data = ncpu;
                      break;
      
      #ifdef CPU_UCODE
              case IOC_CPU_UCODE_GET_VERSION:
                      error = cpu_ucode_get_version((struct cpu_ucode_version *)data);
                      break;
      
              case IOC_CPU_UCODE_APPLY:
                      error = kauth_authorize_machdep(l->l_cred,
                          KAUTH_MACHDEP_CPU_UCODE_APPLY,
                          NULL, NULL, NULL, NULL);
                      if (error != 0)
                              break;
                      error = cpu_ucode_apply((const struct cpu_ucode *)data);
                      break;
      #endif
      
              default:
                      error = (*compat_cpuctl_ioctl)(l, cmd, data);
                      break;
              }
              mutex_exit(&cpu_lock);
      
              return error;
      }
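
       /*
        * Illustrative sketch, not part of the original source: how a userland
        * consumer (normally cpuctl(8)) might query a CPU through this ioctl
        * interface.  The /dev/cpuctl path is an assumption here.
        *
        *      cpustate_t cs;
        *      int fd = open("/dev/cpuctl", O_RDONLY);
        *
        *      memset(&cs, 0, sizeof(cs));
        *      cs.cs_id = 0;
        *      if (fd != -1 && ioctl(fd, IOC_CPU_GETSTATE, &cs) == 0)
        *              printf("cpu%u: online=%d intr=%d\n",
        *                  (unsigned)cs.cs_id, (int)cs.cs_online, (int)cs.cs_intr);
        */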
      
      struct cpu_info *
      cpu_lookup(u_int idx)
      {
              struct cpu_info *ci;
      
              /*
               * cpu_infos is a NULL terminated array of MAXCPUS + 1 entries,
               * so an index of MAXCPUS here is ok.  See mi_cpu_attach.
               */
              KASSERT(idx <= maxcpus);
      
              if (__predict_false(cpu_infos == NULL)) {
                      KASSERT(idx == 0);
                      return curcpu();
              }
      
              ci = cpu_infos[idx];
              KASSERT(ci == NULL || cpu_index(ci) == idx);
              KASSERTMSG(idx < maxcpus || ci == NULL, "idx %d ci %p", idx, ci);
      
              return ci;
      }
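
       /*
        * Illustrative sketch, not part of the original source: callers that
        * walk the index space must tolerate NULL entries, since cpu_lookup()
        * can return NULL for an index that was never attached:
        *
        *      for (u_int i = 0; i < maxcpus; i++) {
        *              struct cpu_info *ci2 = cpu_lookup(i);
        *              if (ci2 == NULL)
        *                      continue;
        *              ... use ci2 ...
        *      }
        *
        * Most MI code uses the CPU_INFO_FOREACH() iterator instead.
        */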
      
      static void
      cpu_xc_offline(struct cpu_info *ci)
      {
              struct schedstate_percpu *spc, *mspc = NULL;
              struct cpu_info *target_ci;
              struct lwp *l;
              CPU_INFO_ITERATOR cii;
              int s;
      
              /*
               * Thread that made the cross call (separate context) holds
               * cpu_lock on our behalf.
               */
              spc = &ci->ci_schedstate;
              s = splsched();
              spc->spc_flags |= SPCF_OFFLINE;
              splx(s);
      
              /* Take the first available CPU for the migration. */
              for (CPU_INFO_FOREACH(cii, target_ci)) {
                      mspc = &target_ci->ci_schedstate;
                      if ((mspc->spc_flags & SPCF_OFFLINE) == 0)
                              break;
              }
              KASSERT(target_ci != NULL);
      
              /*
               * Migrate all non-bound threads to the other CPU.  Note that this
               * runs from the xcall thread, thus handling of LSONPROC is not needed.
               */
              mutex_enter(proc_lock);
              LIST_FOREACH(l, &alllwp, l_list) {
                      struct cpu_info *mci;
      
                      lwp_lock(l);
                      if (l->l_cpu != ci || (l->l_pflag & (LP_BOUND | LP_INTR))) {
                              lwp_unlock(l);
                              continue;
                      }
                      /* Regular case - no affinity. */
                      if (l->l_affinity == NULL) {
                              lwp_migrate(l, target_ci);
                              continue;
                      }
                      /* Affinity is set, find an online CPU in the set. */
                      for (CPU_INFO_FOREACH(cii, mci)) {
                              mspc = &mci->ci_schedstate;
                              if ((mspc->spc_flags & SPCF_OFFLINE) == 0 &&
                                  kcpuset_isset(l->l_affinity, cpu_index(mci)))
                                      break;
                      }
                      if (mci == NULL) {
                              lwp_unlock(l);
                              mutex_exit(proc_lock);
                              goto fail;
                      }
                      lwp_migrate(l, mci);
              }
              mutex_exit(proc_lock);
      
      #if PCU_UNIT_COUNT > 0
              pcu_save_all_on_cpu();
      #endif
      
      #ifdef __HAVE_MD_CPU_OFFLINE
              cpu_offline_md();
      #endif
              return;
      fail:
              /* Just unset the SPCF_OFFLINE flag, caller will check */
              s = splsched();
              spc->spc_flags &= ~SPCF_OFFLINE;
              splx(s);
      }
      
      static void
      cpu_xc_online(struct cpu_info *ci)
      {
              struct schedstate_percpu *spc;
              int s;
      
              spc = &ci->ci_schedstate;
              s = splsched();
              spc->spc_flags &= ~SPCF_OFFLINE;
              splx(s);
      }
      
      int
      cpu_setstate(struct cpu_info *ci, bool online)
      {
              struct schedstate_percpu *spc;
              CPU_INFO_ITERATOR cii;
              struct cpu_info *ci2;
              uint64_t where;
              xcfunc_t func;
              int nonline;
      
              spc = &ci->ci_schedstate;
      
              KASSERT(mutex_owned(&cpu_lock));
      
              if (online) {
                      if ((spc->spc_flags & SPCF_OFFLINE) == 0)
                              return 0;
                      func = (xcfunc_t)cpu_xc_online;
              } else {
                      if ((spc->spc_flags & SPCF_OFFLINE) != 0)
                              return 0;
                      nonline = 0;
                      /*
                       * Ensure that at least one CPU within the processor set
                       * stays online.  Revisit this later.
                       */
                      for (CPU_INFO_FOREACH(cii, ci2)) {
                              if ((ci2->ci_schedstate.spc_flags & SPCF_OFFLINE) != 0)
                                      continue;
                              if (ci2->ci_schedstate.spc_psid != spc->spc_psid)
                                      continue;
                              nonline++;
                      }
                      if (nonline == 1)
                              return EBUSY;
                      func = (xcfunc_t)cpu_xc_offline;
              }
      
              where = xc_unicast(0, func, ci, NULL, ci);
              xc_wait(where);
              if (online) {
                      KASSERT((spc->spc_flags & SPCF_OFFLINE) == 0);
                      ncpuonline++;
              } else {
                      if ((spc->spc_flags & SPCF_OFFLINE) == 0) {
                       /* If it was not set offline, then it is busy. */
                              return EBUSY;
                      }
                      ncpuonline--;
              }
      
              spc->spc_lastmod = time_second;
              return 0;
      }
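
       /*
        * Illustrative sketch, not part of the original source: cpu_setstate()
        * must be called with cpu_lock held (see the KASSERT above), as
        * cpuctl_ioctl() does.  Taking a CPU offline from kernel code would
        * look roughly like:
        *
        *      mutex_enter(&cpu_lock);
        *      error = cpu_setstate(ci, false);        false = offline, true = online
        *      mutex_exit(&cpu_lock);
        */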
      
      int
      cpu_setmodel(const char *fmt, ...)
      {
              int len;
              va_list ap;
      
              va_start(ap, fmt);
              len = vsnprintf(cpu_model, sizeof(cpu_model), fmt, ap);
              va_end(ap);
              return len;
      }
      
      const char *
      cpu_getmodel(void)
      {
              return cpu_model;
      }
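
       /*
        * Illustrative sketch, not part of the original source: machine-dependent
        * identification code sets the model string once and MI code reads it
        * back.  The variable names below are placeholders.
        *
        *      cpu_setmodel("%s %s", vendor_str, brand_str);
        *      printf("cpu0: %s\n", cpu_getmodel());
        */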
      
      #ifdef __HAVE_INTR_CONTROL
      static void
      cpu_xc_intr(struct cpu_info *ci)
      {
              struct schedstate_percpu *spc;
              int s;
      
              spc = &ci->ci_schedstate;
              s = splsched();
              spc->spc_flags &= ~SPCF_NOINTR;
              splx(s);
      }
      
      static void
      cpu_xc_nointr(struct cpu_info *ci)
      {
              struct schedstate_percpu *spc;
              int s;
      
              spc = &ci->ci_schedstate;
              s = splsched();
              spc->spc_flags |= SPCF_NOINTR;
              splx(s);
      }
      
      int
      cpu_setintr(struct cpu_info *ci, bool intr)
      {
              struct schedstate_percpu *spc;
              CPU_INFO_ITERATOR cii;
              struct cpu_info *ci2;
              uint64_t where;
              xcfunc_t func;
              int nintr;
      
              spc = &ci->ci_schedstate;
      
              KASSERT(mutex_owned(&cpu_lock));
      
              if (intr) {
                      if ((spc->spc_flags & SPCF_NOINTR) == 0)
                              return 0;
                      func = (xcfunc_t)cpu_xc_intr;
              } else {
                      if ((spc->spc_flags & SPCF_NOINTR) != 0)
                              return 0;
                      /*
                       * Ensure that at least one CPU within the system
                        * is handling device interrupts.
                       */
                      nintr = 0;
                      for (CPU_INFO_FOREACH(cii, ci2)) {
                              if ((ci2->ci_schedstate.spc_flags & SPCF_NOINTR) != 0)
                                      continue;
                              if (ci2 == ci)
                                      continue;
                              nintr++;
                      }
                      if (nintr == 0)
                              return EBUSY;
                      func = (xcfunc_t)cpu_xc_nointr;
              }
      
              where = xc_unicast(0, func, ci, NULL, ci);
              xc_wait(where);
              if (intr) {
                      KASSERT((spc->spc_flags & SPCF_NOINTR) == 0);
              } else if ((spc->spc_flags & SPCF_NOINTR) == 0) {
                       /* If interrupt handling was not turned off, then it is busy. */
                      return EBUSY;
              }
      
              /* Direct interrupts away from the CPU and record the change. */
              cpu_intr_redistribute();
              spc->spc_lastmod = time_second;
              return 0;
      }
      #else        /* __HAVE_INTR_CONTROL */
      int
      cpu_setintr(struct cpu_info *ci, bool intr)
      {
      
              return EOPNOTSUPP;
      }
      
      u_int
      cpu_intr_count(struct cpu_info *ci)
      {
      
              return 0;        /* 0 == "don't know" */
      }
      #endif        /* __HAVE_INTR_CONTROL */
      
      bool
      cpu_softintr_p(void)
      {
      
               return (curlwp->l_pflag & LP_INTR) != 0;
      }
      
      #ifdef CPU_UCODE
      int
      cpu_ucode_load(struct cpu_ucode_softc *sc, const char *fwname)
      {
              firmware_handle_t fwh;
              int error;
      
              if (sc->sc_blob != NULL) {
                      firmware_free(sc->sc_blob, sc->sc_blobsize);
                      sc->sc_blob = NULL;
                      sc->sc_blobsize = 0;
              }
      
              error = cpu_ucode_md_open(&fwh, sc->loader_version, fwname);
              if (error != 0) {
                      aprint_error("ucode: firmware_open failed: %i\n", error);
                      goto err0;
              }
      
              sc->sc_blobsize = firmware_get_size(fwh);
              if (sc->sc_blobsize == 0) {
                      error = EFTYPE;
                      firmware_close(fwh);
                      goto err0;
              }
              sc->sc_blob = firmware_malloc(sc->sc_blobsize);
              if (sc->sc_blob == NULL) {
                      error = ENOMEM;
                      firmware_close(fwh);
                      goto err0;
              }
      
              error = firmware_read(fwh, 0, sc->sc_blob, sc->sc_blobsize);
              firmware_close(fwh);
              if (error != 0)
                      goto err1;
      
              return 0;
      
      err1:
              firmware_free(sc->sc_blob, sc->sc_blobsize);
              sc->sc_blob = NULL;
              sc->sc_blobsize = 0;
      err0:
              return error;
      }
      #endif
      /* $NetBSD: kern_auth.c,v 1.77 2018/09/03 16:29:35 riastradh Exp $ */
      
      /*-
       * Copyright (c) 2005, 2006 Elad Efrat <elad@NetBSD.org>
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. The name of the author may not be used to endorse or promote products
       *    derived from this software without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
       * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
       * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
       * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
       * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
       * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
       * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_auth.c,v 1.77 2018/09/03 16:29:35 riastradh Exp $");
      
      #include <sys/types.h>
      #include <sys/param.h>
      #include <sys/queue.h>
      #include <sys/proc.h>
      #include <sys/ucred.h>
      #include <sys/pool.h>
      #define __KAUTH_PRIVATE
      #include <sys/kauth.h>
      #include <sys/kmem.h>
      #include <sys/rwlock.h>
      #include <sys/sysctl.h>
      #include <sys/atomic.h>
      #include <sys/specificdata.h>
      #include <sys/vnode.h>
      
      #include <secmodel/secmodel.h>
      
      /*
       * Secmodel-specific credentials.
       */
      struct kauth_key {
              secmodel_t ks_secmodel;                /* secmodel */
              specificdata_key_t ks_key;        /* key */
      };
      
      
      /*
       * Listener.
       */
      struct kauth_listener {
              kauth_scope_callback_t                func;                /* callback */
              kauth_scope_t                        scope;                /* scope backpointer */
              u_int                                refcnt;                /* reference count */
              SIMPLEQ_ENTRY(kauth_listener)        listener_next;        /* listener list */
      };
      
      /*
       * Scope.
       */
      struct kauth_scope {
              const char                       *id;                /* scope name */
              void                               *cookie;                /* user cookie */
              u_int                                nlisteners;        /* # of listeners */
              SIMPLEQ_HEAD(, kauth_listener)        listenq;        /* listener list */
              SIMPLEQ_ENTRY(kauth_scope)        next_scope;        /* scope list */
      };
      
      static int kauth_cred_hook(kauth_cred_t, kauth_action_t, void *, void *);
      
      /* List of scopes and its lock. */
      static SIMPLEQ_HEAD(, kauth_scope) scope_list =
          SIMPLEQ_HEAD_INITIALIZER(scope_list);
      
/*
 * Built-in scopes: generic, system, process, network, machdep, device,
 * cred, vnode.
 */
      static kauth_scope_t kauth_builtin_scope_generic;
      static kauth_scope_t kauth_builtin_scope_system;
      static kauth_scope_t kauth_builtin_scope_process;
      static kauth_scope_t kauth_builtin_scope_network;
      static kauth_scope_t kauth_builtin_scope_machdep;
      static kauth_scope_t kauth_builtin_scope_device;
      static kauth_scope_t kauth_builtin_scope_cred;
      static kauth_scope_t kauth_builtin_scope_vnode;
      
      static specificdata_domain_t kauth_domain;
      static pool_cache_t kauth_cred_cache;
      
      krwlock_t        kauth_lock;
      
      /* Allocate new, empty kauth credentials. */
      kauth_cred_t
      kauth_cred_alloc(void)
      {
              kauth_cred_t cred;
      
	cred = pool_cache_get(kauth_cred_cache, PR_WAITOK);
      
              cred->cr_refcnt = 1;
              cred->cr_uid = 0;
              cred->cr_euid = 0;
              cred->cr_svuid = 0;
              cred->cr_gid = 0;
              cred->cr_egid = 0;
              cred->cr_svgid = 0;
              cred->cr_ngroups = 0;
      
              specificdata_init(kauth_domain, &cred->cr_sd);
              kauth_cred_hook(cred, KAUTH_CRED_INIT, NULL, NULL);
      
              return (cred);
      }
      
      /* Increment reference count to cred. */
      void
      kauth_cred_hold(kauth_cred_t cred)
      {
	KASSERT(cred != NULL);
	KASSERT(cred != NOCRED);
	KASSERT(cred != FSCRED);
	KASSERT(cred->cr_refcnt > 0);

	atomic_inc_uint(&cred->cr_refcnt);
      }
      
      /* Decrease reference count to cred. If reached zero, free it. */
      void
      kauth_cred_free(kauth_cred_t cred)
      {
      
	KASSERT(cred != NULL);
	KASSERT(cred != NOCRED);
	KASSERT(cred != FSCRED);
	KASSERT(cred->cr_refcnt > 0);
	ASSERT_SLEEPABLE();
      
              if (atomic_dec_uint_nv(&cred->cr_refcnt) > 0)
                      return;
      
              kauth_cred_hook(cred, KAUTH_CRED_FREE, NULL, NULL);
              specificdata_fini(kauth_domain, &cred->cr_sd);
              pool_cache_put(kauth_cred_cache, cred);
      }
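
/*
 * Usage sketch (not part of the original source; the function below is
 * hypothetical): the reference-counting discipline for credentials.
 */
#if 0
static void
example_cred_lifecycle(void)
{
	kauth_cred_t cred;

	cred = kauth_cred_alloc();	/* refcnt == 1, ids zeroed */
	kauth_cred_hold(cred);		/* refcnt == 2, e.g. when storing
					 * the credential in another object */
	kauth_cred_free(cred);		/* refcnt == 1 */
	kauth_cred_free(cred);		/* refcnt == 0: hook runs and the
					 * credential returns to the pool */
}
#endif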
      
      static void
      kauth_cred_clone1(kauth_cred_t from, kauth_cred_t to, bool copy_groups)
      {
	KASSERT(from != NULL);
	KASSERT(from != NOCRED);
	KASSERT(from != FSCRED);
	KASSERT(to != NULL);
	KASSERT(to != NOCRED);
	KASSERT(to != FSCRED);
	KASSERT(from->cr_refcnt > 0);

	to->cr_uid = from->cr_uid;
              to->cr_euid = from->cr_euid;
              to->cr_svuid = from->cr_svuid;
              to->cr_gid = from->cr_gid;
              to->cr_egid = from->cr_egid;
              to->cr_svgid = from->cr_svgid;
              if (copy_groups) {
                      to->cr_ngroups = from->cr_ngroups;
                      memcpy(to->cr_groups, from->cr_groups, sizeof(to->cr_groups));
              }
      
	kauth_cred_hook(from, KAUTH_CRED_COPY, to, NULL);
      }
      
      void
      kauth_cred_clone(kauth_cred_t from, kauth_cred_t to)
      {
              kauth_cred_clone1(from, to, true);
      }
      
      /*
       * Duplicate cred and return a new kauth_cred_t.
       */
      kauth_cred_t
      kauth_cred_dup(kauth_cred_t cred)
      {
              kauth_cred_t new_cred;
      
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(cred->cr_refcnt > 0);
      
              new_cred = kauth_cred_alloc();
      
              kauth_cred_clone(cred, new_cred);
      
              return (new_cred);
      }
      
      /*
       * Similar to crcopy(), only on a kauth_cred_t.
       * XXX: Is this even needed? [kauth_cred_copy]
       */
      kauth_cred_t
      kauth_cred_copy(kauth_cred_t cred)
      {
              kauth_cred_t new_cred;
      
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(cred->cr_refcnt > 0);
      
              /* If the provided credentials already have one reference, use them. */
              if (cred->cr_refcnt == 1)
                      return (cred);
      
              new_cred = kauth_cred_alloc();
      
              kauth_cred_clone(cred, new_cred);
      
              kauth_cred_free(cred);
      
              return (new_cred);
      }
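
/*
 * Usage sketch (not part of the original source; the example_* name is
 * hypothetical): kauth_cred_dup() always creates a new credential, while
 * kauth_cred_copy() returns its argument unchanged when it holds the only
 * reference, and otherwise returns a private duplicate and drops the
 * caller's reference on the shared one.
 */
#if 0
static kauth_cred_t
example_unshare(kauth_cred_t cred)
{

	/* Safe to modify the result without affecting other holders. */
	return kauth_cred_copy(cred);
}
#endif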
      
      void
      kauth_proc_fork(struct proc *parent, struct proc *child)
      {
      
              mutex_enter(parent->p_lock);
              kauth_cred_hold(parent->p_cred);
              child->p_cred = parent->p_cred;
              mutex_exit(parent->p_lock);
      
              /* XXX: relies on parent process stalling during fork() */
              kauth_cred_hook(parent->p_cred, KAUTH_CRED_FORK, parent,
                  child);
      }
      
      void
      kauth_proc_chroot(kauth_cred_t cred, struct cwdinfo *cwdi)
      {
              kauth_cred_hook(cred, KAUTH_CRED_CHROOT, cwdi, NULL);
      }
      
      uid_t
      kauth_cred_getuid(kauth_cred_t cred)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
      
              return (cred->cr_uid);
      }
      
      uid_t
      kauth_cred_geteuid(kauth_cred_t cred)
      {
	KASSERT(cred != NULL);
	KASSERT(cred != NOCRED);
	KASSERT(cred != FSCRED);

	return (cred->cr_euid);
      }
      
      uid_t
      kauth_cred_getsvuid(kauth_cred_t cred)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
      
              return (cred->cr_svuid);
      }
      
      gid_t
      kauth_cred_getgid(kauth_cred_t cred)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
      
              return (cred->cr_gid);
      }
      
      gid_t
      kauth_cred_getegid(kauth_cred_t cred)
      {
	KASSERT(cred != NULL);
	KASSERT(cred != NOCRED);
	KASSERT(cred != FSCRED);

	return (cred->cr_egid);
      }
      
      gid_t
      kauth_cred_getsvgid(kauth_cred_t cred)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
      
              return (cred->cr_svgid);
      }
      
      void
      kauth_cred_setuid(kauth_cred_t cred, uid_t uid)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(cred->cr_refcnt == 1);
      
              cred->cr_uid = uid;
      }
      
      void
      kauth_cred_seteuid(kauth_cred_t cred, uid_t uid)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(cred->cr_refcnt == 1);
      
              cred->cr_euid = uid;
      }
      
      void
      kauth_cred_setsvuid(kauth_cred_t cred, uid_t uid)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(cred->cr_refcnt == 1);
      
              cred->cr_svuid = uid;
      }
      
      void
      kauth_cred_setgid(kauth_cred_t cred, gid_t gid)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(cred->cr_refcnt == 1);
      
              cred->cr_gid = gid;
      }
      
      void
      kauth_cred_setegid(kauth_cred_t cred, gid_t gid)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(cred->cr_refcnt == 1);
      
              cred->cr_egid = gid;
      }
      
      void
      kauth_cred_setsvgid(kauth_cred_t cred, gid_t gid)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(cred->cr_refcnt == 1);
      
              cred->cr_svgid = gid;
      }
      
      /* Checks if gid is a member of the groups in cred. */
      int
      kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp)
      {
              uint32_t i;
      
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(resultp != NULL);
      
              *resultp = 0;
      
              for (i = 0; i < cred->cr_ngroups; i++)
                      if (cred->cr_groups[i] == gid) {
                              *resultp = 1;
                              break;
                      }
      
              return (0);
      }
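
/*
 * Usage sketch (not part of the original source; the function and the gid
 * argument are hypothetical): testing group membership of the current
 * credentials.
 */
#if 0
static bool
example_in_group(gid_t gid)
{
	int ismember = 0;

	/* Always returns 0; the answer comes back through ismember. */
	(void)kauth_cred_ismember_gid(kauth_cred_get(), gid, &ismember);
	return ismember != 0;
}
#endif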
      
      u_int
      kauth_cred_ngroups(kauth_cred_t cred)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
      
              return (cred->cr_ngroups);
      }
      
      /*
       * Return the group at index idx from the groups in cred.
       */
      gid_t
      kauth_cred_group(kauth_cred_t cred, u_int idx)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(idx < cred->cr_ngroups);
      
              return (cred->cr_groups[idx]);
      }
      
      /* XXX elad: gmuid is unused for now. */
      int
      kauth_cred_setgroups(kauth_cred_t cred, const gid_t *grbuf, size_t len,
          uid_t gmuid, enum uio_seg seg)
      {
              int error = 0;
      
	KASSERT(cred != NULL);
	KASSERT(cred != NOCRED);
	KASSERT(cred != FSCRED);
	KASSERT(cred->cr_refcnt == 1);

	if (len > __arraycount(cred->cr_groups))
		return EINVAL;
      
              if (len) {
                      if (seg == UIO_SYSSPACE) {
                              memcpy(cred->cr_groups, grbuf,
                                  len * sizeof(cred->cr_groups[0]));
                      } else {
			error = copyin(grbuf, cred->cr_groups,
                                  len * sizeof(cred->cr_groups[0]));
                              if (error != 0)
                                      len = 0;
                      }
              }
	memset(cred->cr_groups + len, 0xff,
	    sizeof(cred->cr_groups) - (len * sizeof(cred->cr_groups[0])));

	cred->cr_ngroups = len;

	return error;
}
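
/*
 * Usage sketch (not part of the original source; the function and group
 * values are hypothetical): installing a kernel-resident group list on a
 * credential that has a single reference.  gmuid is currently unused, so
 * -1 is passed.
 */
#if 0
static int
example_set_groups(kauth_cred_t cred)
{
	static const gid_t groups[] = { 0, 5, 100 };

	return kauth_cred_setgroups(cred, groups, __arraycount(groups), -1,
	    UIO_SYSSPACE);
}
#endif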
      
      /* This supports sys_setgroups() */
      int
      kauth_proc_setgroups(struct lwp *l, kauth_cred_t ncred)
      {
              kauth_cred_t cred;
              int error;
      
              /*
               * At this point we could delete duplicate groups from ncred,
	 * and plausibly sort the list - but in general the latter is
	 * a bad idea.
               */
	proc_crmod_enter();
              /* Maybe we should use curproc here ? */
              cred = l->l_proc->p_cred;
      
              kauth_cred_clone1(cred, ncred, false);
      
              error = kauth_authorize_process(cred, KAUTH_PROCESS_SETID,
                  l->l_proc, NULL, NULL, NULL);
              if (error != 0) {
                      proc_crmod_leave(cred, ncred, false);
		return error;
              }
      
              /* Broadcast our credentials to the process and other LWPs. */
	proc_crmod_leave(ncred, cred, true);
              return 0;
      }
      
      int
      kauth_cred_getgroups(kauth_cred_t cred, gid_t *grbuf, size_t len,
          enum uio_seg seg)
      {
              KASSERT(cred != NULL);
      
              if (len > cred->cr_ngroups)
                      return EINVAL;
      
              if (seg == UIO_USERSPACE)
                      return copyout(cred->cr_groups, grbuf, sizeof(*grbuf) * len);
              memcpy(grbuf, cred->cr_groups, sizeof(*grbuf) * len);
      
              return 0;
      }
      
      int
      kauth_register_key(secmodel_t secmodel, kauth_key_t *result)
      {
              kauth_key_t k;
              specificdata_key_t key;
              int error;
      
              KASSERT(result != NULL);
      
              error = specificdata_key_create(kauth_domain, &key, NULL);
              if (error)
                      return (error);
      
              k = kmem_alloc(sizeof(*k), KM_SLEEP);
              k->ks_secmodel = secmodel;
              k->ks_key = key;
      
              *result = k;
      
              return (0);
      }
      
      int
      kauth_deregister_key(kauth_key_t key)
      {
              KASSERT(key != NULL);
      
              specificdata_key_delete(kauth_domain, key->ks_key);
              kmem_free(key, sizeof(*key));
      
              return (0);
      }
      
      void *
      kauth_cred_getdata(kauth_cred_t cred, kauth_key_t key)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(key != NULL);
      
              return (specificdata_getspecific(kauth_domain, &cred->cr_sd,
                  key->ks_key));
      }
      
      void
      kauth_cred_setdata(kauth_cred_t cred, kauth_key_t key, void *data)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(key != NULL);
      
              specificdata_setspecific(kauth_domain, &cred->cr_sd, key->ks_key, data);
      }
      
      /*
       * Match uids in two credentials.
       */
      int
      kauth_cred_uidmatch(kauth_cred_t cred1, kauth_cred_t cred2)
      {
              KASSERT(cred1 != NULL);
              KASSERT(cred1 != NOCRED);
              KASSERT(cred1 != FSCRED);
              KASSERT(cred2 != NULL);
              KASSERT(cred2 != NOCRED);
              KASSERT(cred2 != FSCRED);
      
              if (cred1->cr_uid == cred2->cr_uid ||
                  cred1->cr_euid == cred2->cr_uid ||
                  cred1->cr_uid == cred2->cr_euid ||
                  cred1->cr_euid == cred2->cr_euid)
                      return (1);
      
              return (0);
      }
      
      u_int
      kauth_cred_getrefcnt(kauth_cred_t cred)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
      
              return (cred->cr_refcnt);
      }
      
      /*
       * Convert userland credentials (struct uucred) to kauth_cred_t.
       * XXX: For NFS & puffs
       */
      void    
      kauth_uucred_to_cred(kauth_cred_t cred, const struct uucred *uuc)
      {       
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(uuc != NULL);
       
              cred->cr_refcnt = 1;
              cred->cr_uid = uuc->cr_uid;
              cred->cr_euid = uuc->cr_uid;
              cred->cr_svuid = uuc->cr_uid;
              cred->cr_gid = uuc->cr_gid;
              cred->cr_egid = uuc->cr_gid;
              cred->cr_svgid = uuc->cr_gid;
              cred->cr_ngroups = uimin(uuc->cr_ngroups, NGROUPS);
              kauth_cred_setgroups(cred, __UNCONST(uuc->cr_groups),
                  cred->cr_ngroups, -1, UIO_SYSSPACE);
      }
      
      /*
       * Convert kauth_cred_t to userland credentials (struct uucred).
       * XXX: For NFS & puffs
       */
      void    
      kauth_cred_to_uucred(struct uucred *uuc, const kauth_cred_t cred)
      {       
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(uuc != NULL);
              int ng;
      
              ng = uimin(cred->cr_ngroups, NGROUPS);
              uuc->cr_uid = cred->cr_euid;  
              uuc->cr_gid = cred->cr_egid;  
              uuc->cr_ngroups = ng;
              kauth_cred_getgroups(cred, uuc->cr_groups, ng, UIO_SYSSPACE);
      }
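
/*
 * Usage sketch (not part of the original source; the function is
 * hypothetical): round-tripping credentials through struct uucred, roughly
 * as the NFS and puffs code does.
 */
#if 0
static kauth_cred_t
example_uucred_roundtrip(kauth_cred_t cred)
{
	struct uucred uuc;
	kauth_cred_t new_cred;

	kauth_cred_to_uucred(&uuc, cred);	/* effective ids + groups */

	new_cred = kauth_cred_alloc();
	kauth_uucred_to_cred(new_cred, &uuc);	/* real = effective = saved */
	return new_cred;
}
#endif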
      
      /*
       * Compare kauth_cred_t and uucred credentials.
       * XXX: Modelled after crcmp() for NFS.
       */
      int
      kauth_cred_uucmp(kauth_cred_t cred, const struct uucred *uuc)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(uuc != NULL);
      
              if (cred->cr_euid == uuc->cr_uid &&
                  cred->cr_egid == uuc->cr_gid &&
                  cred->cr_ngroups == (uint32_t)uuc->cr_ngroups) {
                      int i;
      
                      /* Check if all groups from uuc appear in cred. */
                      for (i = 0; i < uuc->cr_ngroups; i++) {
                              int ismember;
      
                              ismember = 0;
                              if (kauth_cred_ismember_gid(cred, uuc->cr_groups[i],
                                  &ismember) != 0 || !ismember)
                                      return (1);
                      }
      
                      return (0);
              }
      
              return (1);
      }
      
      /*
       * Make a struct ucred out of a kauth_cred_t.  For compatibility.
       */
      void
      kauth_cred_toucred(kauth_cred_t cred, struct ki_ucred *uc)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(uc != NULL);
      
              uc->cr_ref = cred->cr_refcnt;
              uc->cr_uid = cred->cr_euid;
              uc->cr_gid = cred->cr_egid;
              uc->cr_ngroups = uimin(cred->cr_ngroups, __arraycount(uc->cr_groups));
              memcpy(uc->cr_groups, cred->cr_groups,
                     uc->cr_ngroups * sizeof(uc->cr_groups[0]));
      }
      
      /*
       * Make a struct pcred out of a kauth_cred_t.  For compatibility.
       */
      void
      kauth_cred_topcred(kauth_cred_t cred, struct ki_pcred *pc)
      {
              KASSERT(cred != NULL);
              KASSERT(cred != NOCRED);
              KASSERT(cred != FSCRED);
              KASSERT(pc != NULL);
      
              pc->p_pad = NULL;
              pc->p_ruid = cred->cr_uid;
              pc->p_svuid = cred->cr_svuid;
              pc->p_rgid = cred->cr_gid;
              pc->p_svgid = cred->cr_svgid;
              pc->p_refcnt = cred->cr_refcnt;
      }
      
      /*
       * Return kauth_cred_t for the current LWP.
       */
      kauth_cred_t
      kauth_cred_get(void)
      {
              return (curlwp->l_cred);
      }
      
      /*
       * Returns a scope matching the provided id.
       * Requires the scope list lock to be held by the caller.
       */
      static kauth_scope_t
      kauth_ifindscope(const char *id)
      {
              kauth_scope_t scope;
      
              KASSERT(rw_lock_held(&kauth_lock));
      
              scope = NULL;
              SIMPLEQ_FOREACH(scope, &scope_list, next_scope) {
                      if (strcmp(scope->id, id) == 0)
                              break;
              }
      
              return (scope);
      }
      
      /*
       * Register a new scope.
       *
       * id - identifier for the scope
       * callback - the scope's default listener
       * cookie - cookie to be passed to the listener(s)
       */
      kauth_scope_t
      kauth_register_scope(const char *id, kauth_scope_callback_t callback,
          void *cookie)
      {
              kauth_scope_t scope;
              kauth_listener_t listener = NULL; /* XXX gcc */
      
              /* Sanitize input */
              if (id == NULL)
                      return (NULL);
      
              /* Allocate space for a new scope and listener. */
              scope = kmem_alloc(sizeof(*scope), KM_SLEEP);
              if (callback != NULL)
                      listener = kmem_alloc(sizeof(*listener), KM_SLEEP);
      
              /*
               * Acquire scope list lock.
               */
              rw_enter(&kauth_lock, RW_WRITER);
      
              /* Check we don't already have a scope with the same id */
              if (kauth_ifindscope(id) != NULL) {
                      rw_exit(&kauth_lock);
      
                      kmem_free(scope, sizeof(*scope));
                      if (callback != NULL)
                              kmem_free(listener, sizeof(*listener));
      
                      return (NULL);
              }
      
              /* Initialize new scope with parameters */
              scope->id = id;
              scope->cookie = cookie;
              scope->nlisteners = 1;
      
              SIMPLEQ_INIT(&scope->listenq);
      
              /* Add default listener */
              if (callback != NULL) {
                      listener->func = callback;
                      listener->scope = scope;
                      listener->refcnt = 0;
                      SIMPLEQ_INSERT_HEAD(&scope->listenq, listener, listener_next);
              }
      
              /* Insert scope to scopes list */
              SIMPLEQ_INSERT_TAIL(&scope_list, scope, next_scope);
      
              rw_exit(&kauth_lock);
      
              return (scope);
      }
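
/*
 * Usage sketch (not part of the original source; the scope name and the
 * example_* names are hypothetical): registering a new scope with a
 * default listener that defers every decision.
 */
#if 0
static kauth_scope_t example_scope;

static int
example_default_listener(kauth_cred_t cred, kauth_action_t action,
    void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{

	return KAUTH_RESULT_DEFER;
}

static void
example_register(void)
{

	example_scope = kauth_register_scope("org.example.dummy",
	    example_default_listener, NULL);
}
#endif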
      
      /*
       * Initialize the kernel authorization subsystem.
       *
       * Initialize the scopes list lock.
       * Create specificdata domain.
       * Register the credentials scope, used in kauth(9) internally.
 * Register built-in scopes: generic, system, process, network, machdep,
 * device, vnode.
       */
      void
      kauth_init(void)
      {
              rw_init(&kauth_lock);
      
              kauth_cred_cache = pool_cache_init(sizeof(struct kauth_cred),
                  coherency_unit, 0, 0, "kcredpl", NULL, IPL_NONE,
                  NULL, NULL, NULL);
      
              /* Create specificdata domain. */
              kauth_domain = specificdata_domain_create();
      
              /* Register credentials scope. */
              kauth_builtin_scope_cred =
                  kauth_register_scope(KAUTH_SCOPE_CRED, NULL, NULL);
      
              /* Register generic scope. */
              kauth_builtin_scope_generic = kauth_register_scope(KAUTH_SCOPE_GENERIC,
                  NULL, NULL);
      
              /* Register system scope. */
              kauth_builtin_scope_system = kauth_register_scope(KAUTH_SCOPE_SYSTEM,
                  NULL, NULL);
      
              /* Register process scope. */
              kauth_builtin_scope_process = kauth_register_scope(KAUTH_SCOPE_PROCESS,
                  NULL, NULL);
      
              /* Register network scope. */
              kauth_builtin_scope_network = kauth_register_scope(KAUTH_SCOPE_NETWORK,
                  NULL, NULL);
      
              /* Register machdep scope. */
              kauth_builtin_scope_machdep = kauth_register_scope(KAUTH_SCOPE_MACHDEP,
                  NULL, NULL);
      
              /* Register device scope. */
              kauth_builtin_scope_device = kauth_register_scope(KAUTH_SCOPE_DEVICE,
                  NULL, NULL);
      
              /* Register vnode scope. */
              kauth_builtin_scope_vnode = kauth_register_scope(KAUTH_SCOPE_VNODE,
                  NULL, NULL);
      }
      
      /*
       * Deregister a scope.
       * Requires scope list lock to be held by the caller.
       *
       * scope - the scope to deregister
       */
      void
      kauth_deregister_scope(kauth_scope_t scope)
      {
              if (scope != NULL) {
                      /* Remove scope from list */
                      SIMPLEQ_REMOVE(&scope_list, scope, kauth_scope, next_scope);
                      kmem_free(scope, sizeof(*scope));
              }
      }
      
      /*
       * Register a listener.
       *
       * id - scope identifier.
       * callback - the callback routine for the listener.
 * cookie - cookie to pass unmodified to the callback.
       */
      kauth_listener_t
      kauth_listen_scope(const char *id, kauth_scope_callback_t callback,
         void *cookie)
      {
              kauth_scope_t scope;
              kauth_listener_t listener;
      
              listener = kmem_alloc(sizeof(*listener), KM_SLEEP);
              rw_enter(&kauth_lock, RW_WRITER);
      
              /*
               * Find scope struct.
               */
              scope = kauth_ifindscope(id);
              if (scope == NULL) {
                      rw_exit(&kauth_lock);
                      kmem_free(listener, sizeof(*listener));
                      return (NULL);
              }
      
              /* Initialize listener with parameters */
              listener->func = callback;
              listener->refcnt = 0;
      
              /* Add listener to scope */
              SIMPLEQ_INSERT_TAIL(&scope->listenq, listener, listener_next);
      
              /* Raise number of listeners on scope. */
              scope->nlisteners++;
              listener->scope = scope;
      
              rw_exit(&kauth_lock);
      
              return (listener);
      }
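
/*
 * Usage sketch (not part of the original source; the example_* names are
 * hypothetical): attaching and detaching a listener on the built-in
 * process scope.  ktrace's ktrace_listener_cb() (kern_ktrace.c) is a real
 * in-tree user of this interface.
 */
#if 0
static int
example_process_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{

	return KAUTH_RESULT_DEFER;	/* leave the decision to others */
}

static void
example_listen(void)
{
	kauth_listener_t l;

	l = kauth_listen_scope(KAUTH_SCOPE_PROCESS, example_process_cb, NULL);
	if (l != NULL)
		kauth_unlisten_scope(l);
}
#endif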
      
      /*
       * Deregister a listener.
       *
       * listener - listener reference as returned from kauth_listen_scope().
       */
      void
      kauth_unlisten_scope(kauth_listener_t listener)
      {
      
              if (listener != NULL) {
                      rw_enter(&kauth_lock, RW_WRITER);
                      SIMPLEQ_REMOVE(&listener->scope->listenq, listener,
                          kauth_listener, listener_next);
                      listener->scope->nlisteners--;
                      rw_exit(&kauth_lock);
                      kmem_free(listener, sizeof(*listener));
              }
      }
      
      /*
       * Authorize a request.
       *
       * scope - the scope of the request as defined by KAUTH_SCOPE_* or as
       *           returned from kauth_register_scope().
       * credential - credentials of the user ("actor") making the request.
       * action - request identifier.
       * arg[0-3] - passed unmodified to listener(s).
       *
       * Returns the aggregated result:
       *     - KAUTH_RESULT_ALLOW if there is at least one KAUTH_RESULT_ALLOW and
 *       zero KAUTH_RESULT_DENY
       *     - KAUTH_RESULT_DENY if there is at least one KAUTH_RESULT_DENY
       *     - KAUTH_RESULT_DEFER if there is nothing but KAUTH_RESULT_DEFER
       */
      static int
      kauth_authorize_action_internal(kauth_scope_t scope, kauth_cred_t cred,
          kauth_action_t action, void *arg0, void *arg1, void *arg2, void *arg3)
      {
              kauth_listener_t listener;
              int error, allow, fail;
      
	KASSERT(cred != NULL);
	KASSERT(action != 0);

	/* Short-circuit requests coming from the kernel. */
	if (cred == NOCRED || cred == FSCRED)
		return KAUTH_RESULT_ALLOW;

	KASSERT(scope != NULL);
      
              fail = 0;
              allow = 0;
      
              /* rw_enter(&kauth_lock, RW_READER); XXX not yet */
	SIMPLEQ_FOREACH(listener, &scope->listenq, listener_next) {
		error = listener->func(cred, action, scope->cookie, arg0,
                          arg1, arg2, arg3);
      
                      if (error == KAUTH_RESULT_ALLOW)
                              allow = 1;
		else if (error == KAUTH_RESULT_DENY)
                              fail = 1;
              }
              /* rw_exit(&kauth_lock); */
      
	if (fail)
		return (KAUTH_RESULT_DENY);

	if (allow)
                      return (KAUTH_RESULT_ALLOW);
      
              return (KAUTH_RESULT_DEFER);
}
      
      int
      kauth_authorize_action(kauth_scope_t scope, kauth_cred_t cred,
          kauth_action_t action, void *arg0, void *arg1, void *arg2, void *arg3)
      {
              int r;
      
	r = kauth_authorize_action_internal(scope, cred, action, arg0, arg1,
                  arg2, arg3);
      
              if (r == KAUTH_RESULT_DENY)
                      return (EPERM);
      
	if (r == KAUTH_RESULT_ALLOW)
                      return (0);
      
	if (secmodel_nsecmodels() == 0)
                      return (0);
      
              return (EPERM);
      }
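
/*
 * Usage sketch (not part of the original source; the action value and the
 * example_* name are hypothetical): requesting authorization and acting on
 * the errno-style result.  One DENY from any listener yields EPERM; one
 * ALLOW with no DENY yields 0; all-DEFER yields EPERM unless no secmodel
 * is loaded.
 */
#if 0
static int
example_authorize(kauth_scope_t scope, void *obj)
{
	int error;

	error = kauth_authorize_action(scope, kauth_cred_get(),
	    1 /* hypothetical action */, obj, NULL, NULL, NULL);
	if (error)
		return error;	/* EPERM */
	/* ... perform the privileged operation ... */
	return 0;
}
#endif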
      
      /*
       * Generic scope authorization wrapper.
       */
      int
      kauth_authorize_generic(kauth_cred_t cred, kauth_action_t action, void *arg0)
      {
              return (kauth_authorize_action(kauth_builtin_scope_generic, cred, 
                  action, arg0, NULL, NULL, NULL));
      }
      
      /*
       * System scope authorization wrapper.
       */
      int
      kauth_authorize_system(kauth_cred_t cred, kauth_action_t action,
          enum kauth_system_req req, void *arg1, void *arg2, void *arg3)
      {
              return (kauth_authorize_action(kauth_builtin_scope_system, cred,
                  action, (void *)req, arg1, arg2, arg3));
      }
      
      /*
       * Process scope authorization wrapper.
       */
      int
      kauth_authorize_process(kauth_cred_t cred, kauth_action_t action,
          struct proc *p, void *arg1, void *arg2, void *arg3)
      {
              return (kauth_authorize_action(kauth_builtin_scope_process, cred,
                  action, p, arg1, arg2, arg3));
      }
      
      /*
       * Network scope authorization wrapper.
       */
      int
      kauth_authorize_network(kauth_cred_t cred, kauth_action_t action,
          enum kauth_network_req req, void *arg1, void *arg2, void *arg3)
      {
              return (kauth_authorize_action(kauth_builtin_scope_network, cred,
                  action, (void *)req, arg1, arg2, arg3));
      }
      
      int
      kauth_authorize_machdep(kauth_cred_t cred, kauth_action_t action,
          void *arg0, void *arg1, void *arg2, void *arg3)
      {
              return (kauth_authorize_action(kauth_builtin_scope_machdep, cred,
                  action, arg0, arg1, arg2, arg3));
      }
      
      int
      kauth_authorize_device(kauth_cred_t cred, kauth_action_t action,
          void *arg0, void *arg1, void *arg2, void *arg3)
      {
              return (kauth_authorize_action(kauth_builtin_scope_device, cred,
                  action, arg0, arg1, arg2, arg3));
      }
      
      int
      kauth_authorize_device_tty(kauth_cred_t cred, kauth_action_t action,
          struct tty *tty)
      {
              return (kauth_authorize_action(kauth_builtin_scope_device, cred,
                  action, tty, NULL, NULL, NULL));
      }
      
      int
      kauth_authorize_device_spec(kauth_cred_t cred, enum kauth_device_req req,
          struct vnode *vp)
      {
              return (kauth_authorize_action(kauth_builtin_scope_device, cred,
                  KAUTH_DEVICE_RAWIO_SPEC, (void *)req, vp, NULL, NULL));
      }
      
      int
      kauth_authorize_device_passthru(kauth_cred_t cred, dev_t dev, u_long bits,
          void *data)
      {
              return (kauth_authorize_action(kauth_builtin_scope_device, cred,
                  KAUTH_DEVICE_RAWIO_PASSTHRU, (void *)bits, (void *)(u_long)dev,
                  data, NULL));
      }
      
      kauth_action_t
      kauth_mode_to_action(mode_t mode)
      {
              kauth_action_t action = 0;
      
              if (mode & VREAD)
                      action |= KAUTH_VNODE_READ_DATA;
              if (mode & VWRITE)
                      action |= KAUTH_VNODE_WRITE_DATA;
              if (mode & VEXEC)
                      action |= KAUTH_VNODE_EXECUTE;
      
              return action;
      }
      
      kauth_action_t
      kauth_extattr_action(mode_t access_mode)
      {
              kauth_action_t action = 0;
      
              if (access_mode & VREAD)
                      action |= KAUTH_VNODE_READ_EXTATTRIBUTES;
              if (access_mode & VWRITE)
                      action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
      
              return action;
      }
      
      int
      kauth_authorize_vnode(kauth_cred_t cred, kauth_action_t action,
          struct vnode *vp, struct vnode *dvp, int fs_decision)
      {
              int error;
      
              error = kauth_authorize_action_internal(kauth_builtin_scope_vnode, cred,
                  action, vp, dvp, NULL, NULL);
      
              if (error == KAUTH_RESULT_DENY)
                      return (EACCES);
      
              if (error == KAUTH_RESULT_ALLOW)
                      return (0);
      
              /*
               * If the file-system does not support decision-before-action, we can
               * only short-circuit the operation (deny). If we're here, it means no
               * listener denied it, so our only alternative is to supposedly-allow
               * it and let the file-system have the last word.
               */
              if (fs_decision == KAUTH_VNODE_REMOTEFS)
                      return (0);
      
              return (fs_decision);
      }
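
/*
 * Usage sketch (not part of the original source; the function is
 * hypothetical): how a file system access check can feed its own verdict
 * into the vnode scope, in the style of VOP_ACCESS() implementations.
 * fs_error is whatever the file system computed from the permission bits
 * (0 or EACCES), or KAUTH_VNODE_REMOTEFS when it cannot decide up front.
 */
#if 0
static int
example_vnode_access(struct vnode *vp, mode_t mode, kauth_cred_t cred,
    int fs_error)
{
	kauth_action_t action;

	/* mode carries VREAD/VWRITE/VEXEC bits, not file permission bits. */
	action = kauth_mode_to_action(mode);
	return kauth_authorize_vnode(cred, action, vp, NULL, fs_error);
}
#endif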
      
      static int
      kauth_cred_hook(kauth_cred_t cred, kauth_action_t action, void *arg0,
          void *arg1)
      {
              int r;
      
	r = kauth_authorize_action(kauth_builtin_scope_cred, cred, action,
                  arg0, arg1, NULL, NULL);
      
      #ifdef DIAGNOSTIC
              if (!SIMPLEQ_EMPTY(&kauth_builtin_scope_cred->listenq))
                      KASSERT(r == 0);
      #endif /* DIAGNOSTIC */
      
	return (r);
      }
      /*        $NetBSD: kern_ktrace.c,v 1.173 2018/09/03 16:29:35 riastradh Exp $        */
      
      /*-
       * Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Copyright (c) 1989, 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)kern_ktrace.c        8.5 (Berkeley) 5/14/95
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_ktrace.c,v 1.173 2018/09/03 16:29:35 riastradh Exp $");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/proc.h>
      #include <sys/file.h>
      #include <sys/kernel.h>
      #include <sys/kthread.h>
      #include <sys/ktrace.h>
      #include <sys/kmem.h>
      #include <sys/syslog.h>
      #include <sys/filedesc.h>
      #include <sys/ioctl.h>
      #include <sys/callout.h>
      #include <sys/kauth.h>
      
      #include <sys/mount.h>
      #include <sys/syscallargs.h>
      
      /*
       * TODO:
       *        - need better error reporting?
       *        - userland utility to sort ktrace.out by timestamp.
       *        - keep minimum information in ktrace_entry when rest of alloc failed.
       *        - per trace control of configurable parameters.
       */
      
      struct ktrace_entry {
              TAILQ_ENTRY(ktrace_entry) kte_list;
              struct        ktr_header kte_kth;
              void        *kte_buf;
              size_t        kte_bufsz;
      #define        KTE_SPACE                32
              uint8_t kte_space[KTE_SPACE] __aligned(sizeof(register_t));
      };
      
      struct ktr_desc {
              TAILQ_ENTRY(ktr_desc) ktd_list;
              int ktd_flags;
      #define        KTDF_WAIT                0x0001
      #define        KTDF_DONE                0x0002
      #define        KTDF_BLOCKING                0x0004
      #define        KTDF_INTERACTIVE        0x0008
              int ktd_error;
      #define        KTDE_ENOMEM                0x0001
      #define        KTDE_ENOSPC                0x0002
              int ktd_errcnt;
	int ktd_ref;			/* # of references */
	int ktd_qcount;			/* # of entries in the queue */
      
              /*
               * Params to control behaviour.
               */
	int ktd_delayqcnt;		/* # of entries allowed to delay */
              int ktd_wakedelay;                /* delay of wakeup in *tick* */
              int ktd_intrwakdl;                /* ditto, but when interactive */
      
              file_t *ktd_fp;                        /* trace output file */
              lwp_t *ktd_lwp;                        /* our kernel thread */
              TAILQ_HEAD(, ktrace_entry) ktd_queue;
              callout_t ktd_wakch;                /* delayed wakeup */
              kcondvar_t ktd_sync_cv;
              kcondvar_t ktd_cv;
      };
      
      static void        ktrwrite(struct ktr_desc *, struct ktrace_entry *);
      static int        ktrops(lwp_t *, struct proc *, int, int,
                          struct ktr_desc *);
      static int        ktrsetchildren(lwp_t *, struct proc *, int, int,
                          struct ktr_desc *);
      static int        ktrcanset(lwp_t *, struct proc *);
      static int        ktrsamefile(file_t *, file_t *);
      static void        ktr_kmem(lwp_t *, int, const void *, size_t);
      static void        ktr_io(lwp_t *, int, enum uio_rw, struct iovec *, size_t);
      
      static struct ktr_desc *
                      ktd_lookup(file_t *);
      static void        ktdrel(struct ktr_desc *);
      static void        ktdref(struct ktr_desc *);
      static void        ktefree(struct ktrace_entry *);
      static void        ktd_logerrl(struct ktr_desc *, int);
      static void        ktrace_thread(void *);
      static int        ktrderefall(struct ktr_desc *, int);
      
      /*
 * Default values.
       */
      #define        KTD_MAXENTRY                1000        /* XXX: tune */
      #define        KTD_TIMEOUT                5        /* XXX: tune */
      #define        KTD_DELAYQCNT                100        /* XXX: tune */
      #define        KTD_WAKEDELAY                5000        /* XXX: tune */
      #define        KTD_INTRWAKDL                100        /* XXX: tune */
      
      /*
       * Patchable variables.
       */
int ktd_maxentry = KTD_MAXENTRY;	/* max # of entries in the queue */
int ktd_timeout = KTD_TIMEOUT;		/* timeout in seconds */
int ktd_delayqcnt = KTD_DELAYQCNT;	/* # of entries allowed to delay */
      int ktd_wakedelay = KTD_WAKEDELAY;        /* delay of wakeup in *ms* */
      int ktd_intrwakdl = KTD_INTRWAKDL;        /* ditto, but when interactive */
      
      kmutex_t ktrace_lock;
      int ktrace_on;
      static TAILQ_HEAD(, ktr_desc) ktdq = TAILQ_HEAD_INITIALIZER(ktdq);
      static pool_cache_t kte_cache;
      
      static kauth_listener_t ktrace_listener;
      
      static void
      ktd_wakeup(struct ktr_desc *ktd)
      {
      
              callout_stop(&ktd->ktd_wakch);
              cv_signal(&ktd->ktd_cv);
      }
      
      static void
      ktd_callout(void *arg)
      {
      
              mutex_enter(&ktrace_lock);
              ktd_wakeup(arg);
              mutex_exit(&ktrace_lock);
      }
      
      static void
      ktd_logerrl(struct ktr_desc *ktd, int error)
      {
      
              ktd->ktd_error |= error;
              ktd->ktd_errcnt++;
      }
      
      #if 0
      static void
      ktd_logerr(struct proc *p, int error)
      {
              struct ktr_desc *ktd;
      
              KASSERT(mutex_owned(&ktrace_lock));
      
              ktd = p->p_tracep;
              if (ktd == NULL)
                      return;
      
              ktd_logerrl(ktd, error);
      }
      #endif
      
      static int
      ktrace_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
          void *arg0, void *arg1, void *arg2, void *arg3)
      {
              struct proc *p;
              int result;
              enum kauth_process_req req;
      
              result = KAUTH_RESULT_DEFER;
              p = arg0;
      
	if (action != KAUTH_PROCESS_KTRACE)
		return result;
      
              req = (enum kauth_process_req)(unsigned long)arg1;
      
              /* Privileged; secmodel should handle these. */
              if (req == KAUTH_REQ_PROCESS_KTRACE_PERSISTENT)
                      return result;
      
              if ((p->p_traceflag & KTRFAC_PERSISTENT) ||
                  (p->p_flag & PK_SUGID))
                      return result;
      
              if (kauth_cred_geteuid(cred) == kauth_cred_getuid(p->p_cred) &&
                  kauth_cred_getuid(cred) == kauth_cred_getsvuid(p->p_cred) &&
                  kauth_cred_getgid(cred) == kauth_cred_getgid(p->p_cred) &&
                  kauth_cred_getgid(cred) == kauth_cred_getsvgid(p->p_cred))
                      result = KAUTH_RESULT_ALLOW;
      
              return result;
      }
      
      /*
       * Initialise the ktrace system.
       */
      void
      ktrinit(void)
      {
      
              mutex_init(&ktrace_lock, MUTEX_DEFAULT, IPL_NONE);
              kte_cache = pool_cache_init(sizeof(struct ktrace_entry), 0, 0, 0,
                  "ktrace", &pool_allocator_nointr, IPL_NONE, NULL, NULL, NULL);
      
              ktrace_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
                  ktrace_listener_cb, NULL);
      }
      
      /*
       * Release a reference.  Called with ktrace_lock held.
       */
      void
      ktdrel(struct ktr_desc *ktd)
      {
      
              KASSERT(mutex_owned(&ktrace_lock));
      
              KDASSERT(ktd->ktd_ref != 0);
              KASSERT(ktd->ktd_ref > 0);
              KASSERT(ktrace_on > 0);
              ktrace_on--;
              if (--ktd->ktd_ref <= 0) {
                      ktd->ktd_flags |= KTDF_DONE;
                      cv_signal(&ktd->ktd_cv);
              }
      }
      
      void
      ktdref(struct ktr_desc *ktd)
      {
      
              KASSERT(mutex_owned(&ktrace_lock));
      
              ktd->ktd_ref++;
              ktrace_on++;
      }
      
      struct ktr_desc *
      ktd_lookup(file_t *fp)
      {
              struct ktr_desc *ktd;
      
              KASSERT(mutex_owned(&ktrace_lock));
      
              for (ktd = TAILQ_FIRST(&ktdq); ktd != NULL;
                  ktd = TAILQ_NEXT(ktd, ktd_list)) {
                      if (ktrsamefile(ktd->ktd_fp, fp)) {
                              ktdref(ktd);
                              break;
                      }
              }
      
              return (ktd);
      }
      
      void
      ktraddentry(lwp_t *l, struct ktrace_entry *kte, int flags)
      {
              struct proc *p = l->l_proc;
              struct ktr_desc *ktd;
      #ifdef DEBUG
              struct timeval t1, t2;
      #endif
      
              mutex_enter(&ktrace_lock);
      
              if (p->p_traceflag & KTRFAC_TRC_EMUL) {
                      /* Add emulation trace before first entry for this process */
                      p->p_traceflag &= ~KTRFAC_TRC_EMUL;
                      mutex_exit(&ktrace_lock);
                      ktrexit(l);
                      ktremul();
                      (void)ktrenter(l);
                      mutex_enter(&ktrace_lock);
              }
      
              /* Tracing may have been cancelled. */
              ktd = p->p_tracep;
              if (ktd == NULL)
                      goto freekte;
      
              /*
               * Bump reference count so that the object will remain while
	 * we are here.  Note that the trace is controlled by another
	 * process.
               */
              ktdref(ktd);
      
              if (ktd->ktd_flags & KTDF_DONE)
                      goto relktd;
      
              if (ktd->ktd_qcount > ktd_maxentry) {
                      ktd_logerrl(ktd, KTDE_ENOSPC);
                      goto relktd;
              }
              TAILQ_INSERT_TAIL(&ktd->ktd_queue, kte, kte_list);
              ktd->ktd_qcount++;
              if (ktd->ktd_flags & KTDF_BLOCKING)
                      goto skip_sync;
      
              if (flags & KTA_WAITOK &&
                  (/* flags & KTA_LARGE */0 || ktd->ktd_flags & KTDF_WAIT ||
                  ktd->ktd_qcount > ktd_maxentry >> 1))
                      /*
		 * Sync with the writer thread, since this is a rather
		 * big request or many requests are pending.
                       */
                      do {
                              ktd->ktd_flags |= KTDF_WAIT;
                              ktd_wakeup(ktd);
      #ifdef DEBUG
                              getmicrouptime(&t1);
      #endif
                              if (cv_timedwait(&ktd->ktd_sync_cv, &ktrace_lock,
                                  ktd_timeout * hz) != 0) {
                                      ktd->ktd_flags |= KTDF_BLOCKING;
                                      /*
                                       * Maybe the writer thread is blocking
                                       * completely for some reason, but
                                       * don't stop target process forever.
                                       */
                                      log(LOG_NOTICE, "ktrace timeout\n");
                                      break;
                              }
      #ifdef DEBUG
                              getmicrouptime(&t2);
                              timersub(&t2, &t1, &t2);
                              if (t2.tv_sec > 0)
                                      log(LOG_NOTICE,
                                          "ktrace long wait: %lld.%06ld\n",
                                          (long long)t2.tv_sec, (long)t2.tv_usec);
      #endif
                      } while (p->p_tracep == ktd &&
                          (ktd->ktd_flags & (KTDF_WAIT | KTDF_DONE)) == KTDF_WAIT);
              else {
                      /* Schedule delayed wakeup */
                      if (ktd->ktd_qcount > ktd->ktd_delayqcnt)
                              ktd_wakeup(ktd);        /* Wakeup now */
                      else if (!callout_pending(&ktd->ktd_wakch))
                              callout_reset(&ktd->ktd_wakch,
                                  ktd->ktd_flags & KTDF_INTERACTIVE ?
                                  ktd->ktd_intrwakdl : ktd->ktd_wakedelay,
                                  ktd_callout, ktd);
              }
      
      skip_sync:
              ktdrel(ktd);
              mutex_exit(&ktrace_lock);
              ktrexit(l);
              return;
      
      relktd:
              ktdrel(ktd);
      
      freekte:
              mutex_exit(&ktrace_lock);
              ktefree(kte);
              ktrexit(l);
      }
      
      void
      ktefree(struct ktrace_entry *kte)
      {
      
              if (kte->kte_buf != kte->kte_space)
                      kmem_free(kte->kte_buf, kte->kte_bufsz);
              pool_cache_put(kte_cache, kte);
      }
      
      /*
       * "deep" compare of two files for the purposes of clearing a trace.
       * Returns true if they're the same open file, or if they point at the
       * same underlying vnode/socket.
       */
      
      int
      ktrsamefile(file_t *f1, file_t *f2)
      {
      
              return ((f1 == f2) ||
                  ((f1 != NULL) && (f2 != NULL) &&
                      (f1->f_type == f2->f_type) &&
                      (f1->f_data == f2->f_data)));
      }
      
      void
      ktrderef(struct proc *p)
      {
              struct ktr_desc *ktd = p->p_tracep;
      
              KASSERT(mutex_owned(&ktrace_lock));
      
              p->p_traceflag = 0;
              if (ktd == NULL)
                      return;
              p->p_tracep = NULL;
      
              cv_broadcast(&ktd->ktd_sync_cv);
              ktdrel(ktd);
      }
      
      void
      ktradref(struct proc *p)
      {
              struct ktr_desc *ktd = p->p_tracep;
      
              KASSERT(mutex_owned(&ktrace_lock));
      
              ktdref(ktd);
      }
      
      int
      ktrderefall(struct ktr_desc *ktd, int auth)
      {
              lwp_t *curl = curlwp;
              struct proc *p;
              int error = 0;
      
              mutex_enter(proc_lock);
              PROCLIST_FOREACH(p, &allproc) {
                      if (p->p_tracep != ktd)
                              continue;
                      mutex_enter(p->p_lock);
                      mutex_enter(&ktrace_lock);
                      if (p->p_tracep == ktd) {
                              if (!auth || ktrcanset(curl, p))
                                      ktrderef(p);
                              else
                                      error = EPERM;
                      }
                      mutex_exit(&ktrace_lock);
                      mutex_exit(p->p_lock);
              }
              mutex_exit(proc_lock);
      
              return error;
      }
      
      int
      ktealloc(struct ktrace_entry **ktep, void **bufp, lwp_t *l, int type,
               size_t sz)
      {
              struct proc *p = l->l_proc;
              struct ktrace_entry *kte;
              struct ktr_header *kth;
              void *buf;
      
              if (ktrenter(l))
                      return EAGAIN;
      
              kte = pool_cache_get(kte_cache, PR_WAITOK);
              if (sz > sizeof(kte->kte_space)) {
                      buf = kmem_alloc(sz, KM_SLEEP);
              } else
                      buf = kte->kte_space;
      
              kte->kte_bufsz = sz;
              kte->kte_buf = buf;
      
              kth = &kte->kte_kth;
              (void)memset(kth, 0, sizeof(*kth));
              kth->ktr_len = sz;
              kth->ktr_type = type;
              kth->ktr_pid = p->p_pid;
              memcpy(kth->ktr_comm, p->p_comm, MAXCOMLEN);
              kth->ktr_version = KTRFAC_VERSION(p->p_traceflag);
              kth->ktr_lid = l->l_lid;
              nanotime(&kth->ktr_ts);
      
              *ktep = kte;
              *bufp = buf;
      
              return 0;
      }
      
      void
      ktesethdrlen(struct ktrace_entry *kte, size_t l)
      {        
              kte->kte_kth.ktr_len = l;
      }
      
      void
      ktr_syscall(register_t code, const register_t args[], int narg)
      {
              lwp_t *l = curlwp;
              struct proc *p = l->l_proc;
              struct ktrace_entry *kte;
              struct ktr_syscall *ktp;
              register_t *argp;
              size_t len;
              u_int i;
      
              if (!KTRPOINT(p, KTR_SYSCALL))
                      return;
      
              len = sizeof(struct ktr_syscall) + narg * sizeof argp[0];
      
              if (ktealloc(&kte, (void *)&ktp, l, KTR_SYSCALL, len))
                      return;
      
              ktp->ktr_code = code;
              ktp->ktr_argsize = narg * sizeof argp[0];
              argp = (register_t *)(ktp + 1);
              for (i = 0; i < narg; i++)
                      *argp++ = args[i];
      
              ktraddentry(l, kte, KTA_WAITOK);
      }
      
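/*
 * Record a system call return: the syscall code, the error, and the
 * return values (zero when an error occurred).
 */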
      void
      ktr_sysret(register_t code, int error, register_t *retval)
      {
              lwp_t *l = curlwp;
              struct ktrace_entry *kte;
              struct ktr_sysret *ktp;
      
              if (!KTRPOINT(l->l_proc, KTR_SYSRET))
                      return;
      
              if (ktealloc(&kte, (void *)&ktp, l, KTR_SYSRET,
                  sizeof(struct ktr_sysret)))
                      return;
      
              ktp->ktr_code = code;
              ktp->ktr_eosys = 0;                        /* XXX unused */
              ktp->ktr_error = error;
              ktp->ktr_retval = retval && error == 0 ? retval[0] : 0;
              ktp->ktr_retval_1 = retval && error == 0 ? retval[1] : 0;
      
              ktraddentry(l, kte, KTA_WAITOK);
      }
      
      void
      ktr_namei(const char *path, size_t pathlen)
      {
              lwp_t *l = curlwp;
      
              if (!KTRPOINT(l->l_proc, KTR_NAMEI))
                      return;
      
              ktr_kmem(l, KTR_NAMEI, path, pathlen);
      }
      
      void
      ktr_namei2(const char *eroot, size_t erootlen,
                const char *path, size_t pathlen)
      {
              lwp_t *l = curlwp;
              struct ktrace_entry *kte;
              void *buf;
      
              if (!KTRPOINT(l->l_proc, KTR_NAMEI))
                      return;
      
              if (ktealloc(&kte, &buf, l, KTR_NAMEI, erootlen + pathlen))
                      return;
              memcpy(buf, eroot, erootlen);
              buf = (char *)buf + erootlen;
              memcpy(buf, path, pathlen);
              ktraddentry(l, kte, KTA_WAITOK);
      }
      
      void
      ktr_emul(void)
      {
              lwp_t *l = curlwp;
              const char *emul = l->l_proc->p_emul->e_name;
      
              if (!KTRPOINT(l->l_proc, KTR_EMUL))
                      return;
      
              ktr_kmem(l, KTR_EMUL, emul, strlen(emul));
      }
      
      void
      ktr_execarg(const void *bf, size_t len)
      {
              lwp_t *l = curlwp;
      
              if (!KTRPOINT(l->l_proc, KTR_EXEC_ARG))
                      return;
      
              ktr_kmem(l, KTR_EXEC_ARG, bf, len);
      }
      
      void
      ktr_execenv(const void *bf, size_t len)
      {
              lwp_t *l = curlwp;
      
              if (!KTRPOINT(l->l_proc, KTR_EXEC_ENV))
                      return;
      
              ktr_kmem(l, KTR_EXEC_ENV, bf, len);
      }
      
      void
      ktr_execfd(int fd, u_int dtype)
      {
              struct ktrace_entry *kte;
              struct ktr_execfd* ktp;
      
              lwp_t *l = curlwp;
      
              if (!KTRPOINT(l->l_proc, KTR_EXEC_FD))
                      return;
      
              if (ktealloc(&kte, (void *)&ktp, l, KTR_EXEC_FD, sizeof(*ktp)))
                      return;
      
              ktp->ktr_fd = fd;
              ktp->ktr_dtype = dtype;
              ktraddentry(l, kte, KTA_WAITOK);
      }
      
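/*
 * Common helper: emit an entry whose payload is copied from a kernel
 * buffer.
 */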
      static void
      ktr_kmem(lwp_t *l, int type, const void *bf, size_t len)
      {
              struct ktrace_entry *kte;
              void *buf;
      
              if (ktealloc(&kte, &buf, l, type, len))
                      return;
              memcpy(buf, bf, len);
              ktraddentry(l, kte, KTA_WAITOK);
      }
      
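/*
 * Record generic I/O.  The user data described by "iov" is copied in
 * chunks so that each entry (ktr_genio header included) stays within
 * PAGE_SIZE; between chunks we yield the CPU if the scheduler has
 * requested it.
 */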
      static void
      ktr_io(lwp_t *l, int fd, enum uio_rw rw, struct iovec *iov, size_t len)
      {
              struct ktrace_entry *kte;
              struct ktr_genio *ktp;
              size_t resid = len, cnt, buflen;
              char *cp;
      
       next:
              buflen = uimin(PAGE_SIZE, resid + sizeof(struct ktr_genio));
      
              if (ktealloc(&kte, (void *)&ktp, l, KTR_GENIO, buflen))
                      return;
      
              ktp->ktr_fd = fd;
              ktp->ktr_rw = rw;
      
              cp = (void *)(ktp + 1);
              buflen -= sizeof(struct ktr_genio);
              kte->kte_kth.ktr_len = sizeof(struct ktr_genio);
      
              while (buflen > 0) {
                      cnt = uimin(iov->iov_len, buflen);
                      if (copyin(iov->iov_base, cp, cnt) != 0)
                              goto out;
                      kte->kte_kth.ktr_len += cnt;
                      cp += cnt;
                      buflen -= cnt;
                      resid -= cnt;
                      iov->iov_len -= cnt;
                      if (iov->iov_len == 0)
                              iov++;
                      else
                              iov->iov_base = (char *)iov->iov_base + cnt;
              }
      
	/*
	 * Don't push too many entries at once.  It would cause kmem map
	 * shortage.
	 */
              ktraddentry(l, kte, KTA_WAITOK | KTA_LARGE);
              if (resid > 0) {
                      if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) {
                              (void)ktrenter(l);
                              preempt();
                              ktrexit(l);
                      }
      
                      goto next;
              }
      
              return;
      
      out:
              ktefree(kte);
              ktrexit(l);
      }
      
      void
      ktr_genio(int fd, enum uio_rw rw, const void *addr, size_t len, int error)
      {
              lwp_t *l = curlwp;
              struct iovec iov;
      
              if (!KTRPOINT(l->l_proc, KTR_GENIO) || error != 0)
                      return;
              iov.iov_base = __UNCONST(addr);
              iov.iov_len = len;
              ktr_io(l, fd, rw, &iov, len);
      }
      
      void
      ktr_geniov(int fd, enum uio_rw rw, struct iovec *iov, size_t len, int error)
      {
              lwp_t *l = curlwp;
      
              if (!KTRPOINT(l->l_proc, KTR_GENIO) || error != 0)
                      return;
              ktr_io(l, fd, rw, iov, len);
      }
      
      void
      ktr_mibio(int fd, enum uio_rw rw, const void *addr, size_t len, int error)
      {
              lwp_t *l = curlwp;
              struct iovec iov;
      
              if (!KTRPOINT(l->l_proc, KTR_MIB) || error != 0)
                      return;
              iov.iov_base = __UNCONST(addr);
              iov.iov_len = len;
              ktr_io(l, fd, rw, &iov, len);
      }
      
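/*
 * Record signal delivery: signal number, handler, blocked mask and,
 * when available, the accompanying siginfo.
 */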
      void
      ktr_psig(int sig, sig_t action, const sigset_t *mask,
               const ksiginfo_t *ksi)
      {
              struct ktrace_entry *kte;
              lwp_t *l = curlwp;
              struct {
                      struct ktr_psig        kp;
                      siginfo_t        si;
              } *kbuf;
      
              if (!KTRPOINT(l->l_proc, KTR_PSIG))
                      return;
      
              if (ktealloc(&kte, (void *)&kbuf, l, KTR_PSIG, sizeof(*kbuf)))
                      return;
      
              kbuf->kp.signo = (char)sig;
              kbuf->kp.action = action;
              kbuf->kp.mask = *mask;
      
              if (ksi) {
                      kbuf->kp.code = KSI_TRAPCODE(ksi);
                      (void)memset(&kbuf->si, 0, sizeof(kbuf->si));
                      kbuf->si._info = ksi->ksi_info;
                      kte->kte_kth.ktr_len = sizeof(*kbuf);
              } else {
                      kbuf->kp.code = 0;
                      kte->kte_kth.ktr_len = sizeof(struct ktr_psig);
              }
      
              ktraddentry(l, kte, KTA_WAITOK);
      }
      
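/*
 * Record a context switch.  On the way out we only stash the timestamp
 * and flags (we cannot safely queue a record at that point); the
 * deferred "out" record and the "in" record are emitted on the way
 * back in.
 */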
      void
      ktr_csw(int out, int user)
      {
              lwp_t *l = curlwp;
              struct proc *p = l->l_proc;
              struct ktrace_entry *kte;
              struct ktr_csw *kc;
      
              if (!KTRPOINT(p, KTR_CSW))
                      return;
      
              /*
               * Don't record context switches resulting from blocking on 
               * locks; it's too easy to get duff results.
               */
              if (l->l_syncobj == &mutex_syncobj || l->l_syncobj == &rw_syncobj)
                      return;
      
              /*
	 * We can't sleep if we're already going to sleep (if the
	 * original condition is met during the sleep, we would hang).
	 *
	 * XXX This is not ideal: it would be better to maintain a pool
	 * of ktes and push them to the kthread when the context switch
	 * actually happens; however, given the points we are called
	 * from, that is difficult to do.
               */
              if (out) {
                      if (ktrenter(l))
                              return;
      
                      nanotime(&l->l_ktrcsw);
                      l->l_pflag |= LP_KTRCSW;
                      if (user)
                              l->l_pflag |= LP_KTRCSWUSER;
                      else
                              l->l_pflag &= ~LP_KTRCSWUSER;
      
                      ktrexit(l);
                      return;
              }
      
              /*
               * On the way back in, we need to record twice: once for entry, and
               * once for exit.
               */
              if ((l->l_pflag & LP_KTRCSW) != 0) {
                      struct timespec *ts;
                      l->l_pflag &= ~LP_KTRCSW;
      
                      if (ktealloc(&kte, (void *)&kc, l, KTR_CSW, sizeof(*kc)))
                              return;
      
                      kc->out = 1;
                      kc->user = ((l->l_pflag & LP_KTRCSWUSER) != 0);
      
                      ts = &l->l_ktrcsw;
                      switch (KTRFAC_VERSION(p->p_traceflag)) {
                      case 0:
                              kte->kte_kth.ktr_otv.tv_sec = ts->tv_sec;
                              kte->kte_kth.ktr_otv.tv_usec = ts->tv_nsec / 1000;
                              break;
                      case 1: 
                              kte->kte_kth.ktr_ots.tv_sec = ts->tv_sec;
                              kte->kte_kth.ktr_ots.tv_nsec = ts->tv_nsec;
                              break;
                      case 2:
                              kte->kte_kth.ktr_ts.tv_sec = ts->tv_sec;
                              kte->kte_kth.ktr_ts.tv_nsec = ts->tv_nsec;
                              break;
                      default:
                              break;
                      }
      
                      ktraddentry(l, kte, KTA_WAITOK);
              }
      
              if (ktealloc(&kte, (void *)&kc, l, KTR_CSW, sizeof(*kc)))
                      return;
      
              kc->out = 0;
              kc->user = user;
      
              ktraddentry(l, kte, KTA_WAITOK);
      }
      
      bool
      ktr_point(int fac_bit)
      {
              return curlwp->l_proc->p_traceflag & fac_bit;
      }
      
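/*
 * Record a user-supplied (utrace) entry: an identifier string plus up
 * to KTR_USER_MAXLEN bytes of data copied in from userspace.
 */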
      int
      ktruser(const char *id, void *addr, size_t len, int ustr)
      {
              struct ktrace_entry *kte;
              struct ktr_user *ktp;
              lwp_t *l = curlwp;
              void *user_dta;
              int error;
      
              if (!KTRPOINT(l->l_proc, KTR_USER))
                      return 0;
      
              if (len > KTR_USER_MAXLEN)
                      return ENOSPC;
      
              error = ktealloc(&kte, (void *)&ktp, l, KTR_USER, sizeof(*ktp) + len);
              if (error != 0)
                      return error;
      
              if (ustr) {
                      if (copyinstr(id, ktp->ktr_id, KTR_USER_MAXIDLEN, NULL) != 0)
                              ktp->ktr_id[0] = '\0';
              } else
                      strncpy(ktp->ktr_id, id, KTR_USER_MAXIDLEN);
              ktp->ktr_id[KTR_USER_MAXIDLEN-1] = '\0';
      
              user_dta = (void *)(ktp + 1);
              if ((error = copyin(addr, user_dta, len)) != 0)
                      kte->kte_kth.ktr_len = 0;
      
              ktraddentry(l, kte, KTA_WAITOK);
              return error;
      }
      
      void
      ktr_kuser(const char *id, const void *addr, size_t len)
      {
              struct ktrace_entry *kte;
              struct ktr_user *ktp;
              lwp_t *l = curlwp;
              int error;
      
              if (!KTRPOINT(l->l_proc, KTR_USER))
                      return;
      
              if (len > KTR_USER_MAXLEN)
                      return;
      
              error = ktealloc(&kte, (void *)&ktp, l, KTR_USER, sizeof(*ktp) + len);
              if (error != 0)
                      return;
      
              strlcpy(ktp->ktr_id, id, KTR_USER_MAXIDLEN);
      
              memcpy(ktp + 1, addr, len);
      
              ktraddentry(l, kte, KTA_WAITOK);
      }
      
      void
      ktr_mib(const int *name, u_int namelen)
      {
              struct ktrace_entry *kte;
              int *namep;
              size_t size;
              lwp_t *l = curlwp;
      
              if (!KTRPOINT(l->l_proc, KTR_MIB))
                      return;
      
              size = namelen * sizeof(*name);
      
              if (ktealloc(&kte, (void *)&namep, l, KTR_MIB, size))
                      return;
      
              (void)memcpy(namep, name, namelen * sizeof(*name));
      
              ktraddentry(l, kte, KTA_WAITOK);
      }
      
      /* Interface and common routines */
      
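/*
 * Common code for the ktrace and fktrace system calls: look up or
 * create the trace descriptor for the given file, then apply the
 * requested operation to the target process(es).
 */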
      int
      ktrace_common(lwp_t *curl, int ops, int facs, int pid, file_t **fpp)
      {
              struct proc *p;
              struct pgrp *pg;
              struct ktr_desc *ktd = NULL, *nktd;
              file_t *fp = *fpp;
              int ret = 0;
              int error = 0;
              int descend;
      
              descend = ops & KTRFLAG_DESCEND;
              facs = facs & ~((unsigned) KTRFAC_PERSISTENT);
      
              (void)ktrenter(curl);
      
              switch (KTROP(ops)) {
      
              case KTROP_CLEARFILE:
                      /*
                       * Clear all uses of the tracefile
                       */
                      mutex_enter(&ktrace_lock);
                      ktd = ktd_lookup(fp);
                      mutex_exit(&ktrace_lock);
                      if (ktd == NULL)
                              goto done;
                      error = ktrderefall(ktd, 1);
                      goto done;
      
              case KTROP_SET:
                      mutex_enter(&ktrace_lock);
                      ktd = ktd_lookup(fp);
                      mutex_exit(&ktrace_lock);
                      if (ktd == NULL) {
                              nktd = kmem_alloc(sizeof(*nktd), KM_SLEEP);
                              TAILQ_INIT(&nktd->ktd_queue);
                              callout_init(&nktd->ktd_wakch, CALLOUT_MPSAFE);
                              cv_init(&nktd->ktd_cv, "ktrwait");
                              cv_init(&nktd->ktd_sync_cv, "ktrsync");
                              nktd->ktd_flags = 0;
                              nktd->ktd_qcount = 0;
                              nktd->ktd_error = 0;
                              nktd->ktd_errcnt = 0;
                              nktd->ktd_delayqcnt = ktd_delayqcnt;
                              nktd->ktd_wakedelay = mstohz(ktd_wakedelay);
                              nktd->ktd_intrwakdl = mstohz(ktd_intrwakdl);
                              nktd->ktd_ref = 0;
                              nktd->ktd_fp = fp;
                              mutex_enter(&ktrace_lock);
                              ktdref(nktd);
                              mutex_exit(&ktrace_lock);
      
                              /*
			 * XXX: not correct.  Needs a way to detect
			 * whether this is ktruss or ktrace.
                               */
                              if (fp->f_type == DTYPE_PIPE)
                                      nktd->ktd_flags |= KTDF_INTERACTIVE;
      
                              mutex_enter(&fp->f_lock);
                              fp->f_count++;
                              mutex_exit(&fp->f_lock);
                              error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
                                  ktrace_thread, nktd, &nktd->ktd_lwp, "ktrace");
                              if (error != 0) {
                                      kmem_free(nktd, sizeof(*nktd));
                                      nktd = NULL;
                                      mutex_enter(&fp->f_lock);
                                      fp->f_count--;
                                      mutex_exit(&fp->f_lock);
                                      goto done;
                              }
      
                              mutex_enter(&ktrace_lock);
                              ktd = ktd_lookup(fp);
                              if (ktd != NULL) {
                                      ktdrel(nktd);
                                      nktd = NULL;
                              } else {
                                      TAILQ_INSERT_TAIL(&ktdq, nktd, ktd_list);
                                      ktd = nktd;
                              }
                              mutex_exit(&ktrace_lock);
                      }
                      break;
      
              case KTROP_CLEAR:
                      break;
              }
      
              /*
               * need something to (un)trace (XXX - why is this here?)
               */
              if (!facs) {
                      error = EINVAL;
                      *fpp = NULL;
                      goto done;
              }
      
              /*
               * do it
               */
              mutex_enter(proc_lock);
              if (pid < 0) {
                      /*
                       * by process group
                       */
                      pg = pgrp_find(-pid);
                      if (pg == NULL)
                              error = ESRCH;
                      else {
                              LIST_FOREACH(p, &pg->pg_members, p_pglist) {
                                      if (descend)
                                              ret |= ktrsetchildren(curl, p, ops,
                                                  facs, ktd);
                                      else
                                              ret |= ktrops(curl, p, ops, facs,
                                                  ktd);
                              }
                      }
      
              } else {
                      /*
                       * by pid
                       */
                      p = proc_find(pid);
                      if (p == NULL)
                              error = ESRCH;
                      else if (descend)
                              ret |= ktrsetchildren(curl, p, ops, facs, ktd);
                      else
                              ret |= ktrops(curl, p, ops, facs, ktd);
              }
              mutex_exit(proc_lock);
              if (error == 0 && !ret)
                      error = EPERM;
              *fpp = NULL;
      done:
              if (ktd != NULL) {
                      mutex_enter(&ktrace_lock);
                      if (error != 0) {
                              /*
			 * Wake up the thread so that it can die if we
			 * can't trace any process.
                               */
                              ktd_wakeup(ktd);
                      }
                      if (KTROP(ops) == KTROP_SET || KTROP(ops) == KTROP_CLEARFILE)
                              ktdrel(ktd);
                      mutex_exit(&ktrace_lock);
              }
              ktrexit(curl);
              return (error);
      }
      
      /*
       * fktrace system call
       */
      /* ARGSUSED */
      int
      sys_fktrace(struct lwp *l, const struct sys_fktrace_args *uap, register_t *retval)
      {
              /* {
                      syscallarg(int) fd;
                      syscallarg(int) ops;
                      syscallarg(int) facs;
                      syscallarg(int) pid;
              } */
              file_t *fp;
              int error, fd;
      
              fd = SCARG(uap, fd);
              if ((fp = fd_getfile(fd)) == NULL)
                      return (EBADF);
              if ((fp->f_flag & FWRITE) == 0)
                      error = EBADF;
              else
                      error = ktrace_common(l, SCARG(uap, ops),
                          SCARG(uap, facs), SCARG(uap, pid), &fp);
              fd_putfile(fd);
              return error;
      }
      
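/*
 * Apply a ktrace operation (set or clear facilities) to a single
 * process, attaching it to or detaching it from the trace descriptor
 * as needed.
 */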
      int
      ktrops(lwp_t *curl, struct proc *p, int ops, int facs,
          struct ktr_desc *ktd)
      {
              int vers = ops & KTRFAC_VER_MASK;
              int error = 0;
      
              mutex_enter(p->p_lock);
              mutex_enter(&ktrace_lock);
      
              if (!ktrcanset(curl, p))
                      goto out;
      
              switch (vers) {
              case KTRFACv0:
              case KTRFACv1:
              case KTRFACv2:
                      break;
              default:
                      error = EINVAL;
                      goto out;
              }
      
              if (KTROP(ops) == KTROP_SET) {
                      if (p->p_tracep != ktd) {
                              /*
                               * if trace file already in use, relinquish
                               */
                              ktrderef(p);
                              p->p_tracep = ktd;
                              ktradref(p);
                      }
                      p->p_traceflag |= facs;
                      if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KTRACE,
                          p, KAUTH_ARG(KAUTH_REQ_PROCESS_KTRACE_PERSISTENT), NULL,
                          NULL) == 0)
                              p->p_traceflag |= KTRFAC_PERSISTENT;
              } else {
                      /* KTROP_CLEAR */
                      if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) {
                              /* no more tracing */
                              ktrderef(p);
                      }
              }
      
              if (p->p_traceflag)
                      p->p_traceflag |= vers;
              /*
	 * Emit an emulation record every time there is a ktrace
               * change/attach request.
               */
              if (KTRPOINT(p, KTR_EMUL))
                      p->p_traceflag |= KTRFAC_TRC_EMUL;
      
              p->p_trace_enabled = trace_is_enabled(p);
      #ifdef __HAVE_SYSCALL_INTERN
              (*p->p_emul->e_syscall_intern)(p);
      #endif
      
       out:
               mutex_exit(&ktrace_lock);
               mutex_exit(p->p_lock);
      
              return error ? 0 : 1;
      }
      
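/*
 * Apply a ktrace operation to "top" and all of its descendants,
 * walking the process tree iteratively.  Called with proc_lock held.
 */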
      int
      ktrsetchildren(lwp_t *curl, struct proc *top, int ops, int facs,
          struct ktr_desc *ktd)
      {
              struct proc *p;
              int ret = 0;
      
              KASSERT(mutex_owned(proc_lock));
      
              p = top;
              for (;;) {
                      ret |= ktrops(curl, p, ops, facs, ktd);
                      /*
                       * If this process has children, descend to them next,
                       * otherwise do any siblings, and if done with this level,
                       * follow back up the tree (but not past top).
                       */
                      if (LIST_FIRST(&p->p_children) != NULL) {
                              p = LIST_FIRST(&p->p_children);
                              continue;
                      }
                      for (;;) {
                              if (p == top)
                                      return (ret);
                              if (LIST_NEXT(p, p_sibling) != NULL) {
                                      p = LIST_NEXT(p, p_sibling);
                                      break;
                              }
                              p = p->p_pptr;
                      }
              }
              /*NOTREACHED*/
      }
      
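/*
 * Write a chain of ktrace entries to the trace file, converting each
 * header to the trace format version it was recorded with and
 * gathering multiple entries into a single write request.  On a write
 * error other than a transient EWOULDBLOCK, all processes tracing to
 * this descriptor are detached.
 */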
      void
      ktrwrite(struct ktr_desc *ktd, struct ktrace_entry *kte)
      {
              size_t hlen;
              struct uio auio;
              struct iovec aiov[64], *iov;
              struct ktrace_entry *top = kte;
              struct ktr_header *kth;
              file_t *fp = ktd->ktd_fp;
              int error;
      next:
              auio.uio_iov = iov = &aiov[0];
              auio.uio_offset = 0;
              auio.uio_rw = UIO_WRITE;
              auio.uio_resid = 0;
              auio.uio_iovcnt = 0;
              UIO_SETUP_SYSSPACE(&auio);
              do {
                      struct timespec ts;
                      lwpid_t lid;
                      kth = &kte->kte_kth;
      
                      hlen = sizeof(struct ktr_header);
                      switch (kth->ktr_version) {
                      case 0:
                              ts = kth->ktr_time;
      
                              kth->ktr_otv.tv_sec = ts.tv_sec;
                              kth->ktr_otv.tv_usec = ts.tv_nsec / 1000;
                              kth->ktr_unused = NULL;
                              hlen -= sizeof(kth->_v) -
                                  MAX(sizeof(kth->_v._v0), sizeof(kth->_v._v1));
                              break;
                      case 1:
                              ts = kth->ktr_time;
                              lid = kth->ktr_lid;
      
                              kth->ktr_ots.tv_sec = ts.tv_sec;
                              kth->ktr_ots.tv_nsec = ts.tv_nsec;
                              kth->ktr_olid = lid;
                              hlen -= sizeof(kth->_v) -
                                  MAX(sizeof(kth->_v._v0), sizeof(kth->_v._v1));
                              break;
                      }
                      iov->iov_base = (void *)kth;
                      iov++->iov_len = hlen;
                      auio.uio_resid += hlen;
                      auio.uio_iovcnt++;
                      if (kth->ktr_len > 0) {
                              iov->iov_base = kte->kte_buf;
                              iov++->iov_len = kth->ktr_len;
                              auio.uio_resid += kth->ktr_len;
                              auio.uio_iovcnt++;
                      }
              } while ((kte = TAILQ_NEXT(kte, kte_list)) != NULL &&
                  auio.uio_iovcnt < sizeof(aiov) / sizeof(aiov[0]) - 1);
      
      again:
              error = (*fp->f_ops->fo_write)(fp, &fp->f_offset, &auio,
                  fp->f_cred, FOF_UPDATE_OFFSET);
              switch (error) {
      
              case 0:
                      if (auio.uio_resid > 0)
                              goto again;
                      if (kte != NULL)
                              goto next;
                      break;
      
              case EWOULDBLOCK:
                      kpause("ktrzzz", false, 1, NULL);
                      goto again;
      
              default:
                      /*
		 * If an error was encountered, give up tracing on this
                       * vnode.  Don't report EPIPE as this can easily
                       * happen with fktrace()/ktruss.
                       */
      #ifndef DEBUG
                      if (error != EPIPE)
      #endif
                              log(LOG_NOTICE,
                                  "ktrace write failed, errno %d, tracing stopped\n",
                                  error);
                      (void)ktrderefall(ktd, 0);
              }
      
              while ((kte = top) != NULL) {
                      top = TAILQ_NEXT(top, kte_list);
                      ktefree(kte);
              }
      }
      
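/*
 * Per-descriptor worker thread: drains the queue of pending entries
 * and writes them out, then tears the descriptor down once the last
 * reference has gone away.
 */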
      void
      ktrace_thread(void *arg)
      {
              struct ktr_desc *ktd = arg;
              file_t *fp = ktd->ktd_fp;
              struct ktrace_entry *kte;
              int ktrerr, errcnt;
      
              mutex_enter(&ktrace_lock);
              for (;;) {
                      kte = TAILQ_FIRST(&ktd->ktd_queue);
                      if (kte == NULL) {
                              if (ktd->ktd_flags & KTDF_WAIT) {
                                      ktd->ktd_flags &= ~(KTDF_WAIT | KTDF_BLOCKING);
                                      cv_broadcast(&ktd->ktd_sync_cv);
                              }
                              if (ktd->ktd_ref == 0)
                                      break;
                              cv_wait(&ktd->ktd_cv, &ktrace_lock);
                              continue;
                      }
                      TAILQ_INIT(&ktd->ktd_queue);
                      ktd->ktd_qcount = 0;
                      ktrerr = ktd->ktd_error;
                      errcnt = ktd->ktd_errcnt;
                      ktd->ktd_error = ktd->ktd_errcnt = 0;
                      mutex_exit(&ktrace_lock);
      
                      if (ktrerr) {
                              log(LOG_NOTICE,
                                  "ktrace failed, fp %p, error 0x%x, total %d\n",
                                  fp, ktrerr, errcnt);
                      }
                      ktrwrite(ktd, kte);
                      mutex_enter(&ktrace_lock);
              }
      
              TAILQ_REMOVE(&ktdq, ktd, ktd_list);
      
              callout_halt(&ktd->ktd_wakch, &ktrace_lock);
              callout_destroy(&ktd->ktd_wakch);
              mutex_exit(&ktrace_lock);
      
              /*
	 * The ktrace file descriptor can't be watched (it is not visible
	 * to userspace), so no kqueue handling is needed here.
               * XXX: The above comment is wrong, because the fktrace file
               * descriptor is available in userland.
               */
              closef(fp);
      
              cv_destroy(&ktd->ktd_sync_cv);
              cv_destroy(&ktd->ktd_cv);
      
              kmem_free(ktd, sizeof(*ktd));
      
              kthread_exit(0);
      }
      
      /*
       * Return true if caller has permission to set the ktracing state
       * of target.  Essentially, the target can't possess any
       * more permissions than the caller.  KTRFAC_PERSISTENT signifies that
       * the tracing will persist on sugid processes during exec; it is only
       * settable by a process with appropriate credentials.
       *
       * TODO: check groups.  use caller effective gid.
       */
      int
      ktrcanset(lwp_t *calll, struct proc *targetp)
      {
              KASSERT(mutex_owned(targetp->p_lock));
              KASSERT(mutex_owned(&ktrace_lock));
      
              if (kauth_authorize_process(calll->l_cred, KAUTH_PROCESS_KTRACE,
                  targetp, NULL, NULL, NULL) == 0)
                      return (1);
      
              return (0);
      }
      
      /*
       * Put user defined entry to ktrace records.
       */
      int
      sys_utrace(struct lwp *l, const struct sys_utrace_args *uap, register_t *retval)
      {
              /* {
                      syscallarg(const char *) label;
                      syscallarg(void *) addr;
                      syscallarg(size_t) len;
              } */
      
              return ktruser(SCARG(uap, label), SCARG(uap, addr),
                  SCARG(uap, len), 1);
      }
      /*        $NetBSD: kern_synch.c,v 1.323 2019/02/03 03:19:28 mrg Exp $        */
      
      /*-
       * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009
       *    The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
       * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
       * Daniel Sieger.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*-
       * Copyright (c) 1982, 1986, 1990, 1991, 1993
       *        The Regents of the University of California.  All rights reserved.
       * (c) UNIX System Laboratories, Inc.
       * All or some portions of this file are derived from material licensed
       * to the University of California by American Telephone and Telegraph
       * Co. or Unix System Laboratories, Inc. and are reproduced herein with
       * the permission of UNIX System Laboratories, Inc.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)kern_synch.c        8.9 (Berkeley) 5/19/95
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.323 2019/02/03 03:19:28 mrg Exp $");
      
      #include "opt_kstack.h"
      #include "opt_dtrace.h"
      
      #define        __MUTEX_PRIVATE
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/proc.h>
      #include <sys/kernel.h>
      #include <sys/cpu.h>
      #include <sys/pserialize.h>
      #include <sys/resourcevar.h>
      #include <sys/sched.h>
      #include <sys/syscall_stats.h>
      #include <sys/sleepq.h>
      #include <sys/lockdebug.h>
      #include <sys/evcnt.h>
      #include <sys/intr.h>
      #include <sys/lwpctl.h>
      #include <sys/atomic.h>
      #include <sys/syslog.h>
      
      #include <uvm/uvm_extern.h>
      
      #include <dev/lockstat.h>
      
      #include <sys/dtrace_bsd.h>
      int                             dtrace_vtime_active=0;
      dtrace_vtime_switch_func_t      dtrace_vtime_switch_func;
      
      static void        sched_unsleep(struct lwp *, bool);
      static void        sched_changepri(struct lwp *, pri_t);
      static void        sched_lendpri(struct lwp *, pri_t);
      static void        resched_cpu(struct lwp *);
      
      syncobj_t sleep_syncobj = {
              .sobj_flag        = SOBJ_SLEEPQ_SORTED,
              .sobj_unsleep        = sleepq_unsleep,
              .sobj_changepri        = sleepq_changepri,
              .sobj_lendpri        = sleepq_lendpri,
              .sobj_owner        = syncobj_noowner,
      };
      
      syncobj_t sched_syncobj = {
              .sobj_flag        = SOBJ_SLEEPQ_SORTED,
              .sobj_unsleep        = sched_unsleep,
              .sobj_changepri        = sched_changepri,
              .sobj_lendpri        = sched_lendpri,
              .sobj_owner        = syncobj_noowner,
      };
      
      /* "Lightning bolt": once a second sleep address. */
      kcondvar_t                lbolt                        __cacheline_aligned;
      
      u_int                        sched_pstats_ticks        __cacheline_aligned;
      
      /* Preemption event counters. */
      static struct evcnt        kpreempt_ev_crit        __cacheline_aligned;
      static struct evcnt        kpreempt_ev_klock        __cacheline_aligned;
      static struct evcnt        kpreempt_ev_immed        __cacheline_aligned;
      
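/*
 * Initialize the synchronization subsystem: the "lbolt" once-a-second
 * sleep channel and the kernel preemption event counters.
 */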
      void
      synch_init(void)
      {
      
              cv_init(&lbolt, "lbolt");
      
              evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
                 "kpreempt", "defer: critical section");
              evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
                 "kpreempt", "defer: kernel_lock");
              evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
                 "kpreempt", "immediate");
      }
      
      /*
       * OBSOLETE INTERFACE
       *
       * General sleep call.  Suspends the current LWP until a wakeup is
       * performed on the specified identifier.  The LWP will then be made
       * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping; otherwise signals are not checked.
 * Returns 0 if awakened and EWOULDBLOCK if the timeout expires.  If
 * PCATCH is set and a signal needs to be delivered, ERESTART is
 * returned if the current system call should be restarted if possible,
 * and EINTR is returned if the system call should be interrupted by
 * the signal.
       */
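/*
 * A minimal usage sketch (the wait channel, wmesg and timeout below
 * are illustrative only): sleep on "ident" for at most one second,
 * letting signals interrupt the sleep:
 *
 *	error = tsleep(ident, PCATCH, "example", hz);
 */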
      int
      tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo)
      {
              struct lwp *l = curlwp;
              sleepq_t *sq;
              kmutex_t *mp;
      
              KASSERT((l->l_pflag & LP_INTR) == 0);
              KASSERT(ident != &lbolt);
      
              if (sleepq_dontsleep(l)) {
                      (void)sleepq_abort(NULL, 0);
                      return 0;
              }
      
              l->l_kpriority = true;
              sq = sleeptab_lookup(&sleeptab, ident, &mp);
              sleepq_enter(sq, l, mp);
              sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
              return sleepq_block(timo, priority & PCATCH);
      }
      
      int
      mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
              kmutex_t *mtx)
      {
              struct lwp *l = curlwp;
              sleepq_t *sq;
              kmutex_t *mp;
              int error;
      
              KASSERT((l->l_pflag & LP_INTR) == 0);
              KASSERT(ident != &lbolt);
      
              if (sleepq_dontsleep(l)) {
                      (void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
                      return 0;
              }
      
              l->l_kpriority = true;
              sq = sleeptab_lookup(&sleeptab, ident, &mp);
              sleepq_enter(sq, l, mp);
              sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
              mutex_exit(mtx);
              error = sleepq_block(timo, priority & PCATCH);
      
              if ((priority & PNORELOCK) == 0)
                      mutex_enter(mtx);
      
              return error;
      }
      
      /*
       * General sleep call for situations where a wake-up is not expected.
       */
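/*
 * For example (a sketch only), an uninterruptible pause of roughly
 * 10 ms with no interlock:
 *
 *	(void)kpause("pause", false, mstohz(10), NULL);
 */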
      int
      kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
      {
              struct lwp *l = curlwp;
              kmutex_t *mp;
              sleepq_t *sq;
              int error;
      
              KASSERT(!(timo == 0 && intr == false));
      
              if (sleepq_dontsleep(l))
                      return sleepq_abort(NULL, 0);
      
              if (mtx != NULL)
                      mutex_exit(mtx);
              l->l_kpriority = true;
              sq = sleeptab_lookup(&sleeptab, l, &mp);
              sleepq_enter(sq, l, mp);
              sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
              error = sleepq_block(timo, intr);
              if (mtx != NULL)
                      mutex_enter(mtx);
      
              return error;
      }
      
      /*
       * OBSOLETE INTERFACE
       *
       * Make all LWPs sleeping on the specified identifier runnable.
       */
      void
      wakeup(wchan_t ident)
      {
              sleepq_t *sq;
              kmutex_t *mp;
      
              if (__predict_false(cold))
                      return;
      
              sq = sleeptab_lookup(&sleeptab, ident, &mp);
              sleepq_wake(sq, ident, (u_int)-1, mp);
      }
      
      /*
       * General yield call.  Puts the current LWP back on its run queue and
       * performs a voluntary context switch.  Should only be called when the
       * current LWP explicitly requests it (eg sched_yield(2)).
       */
      void
      yield(void)
      {
              struct lwp *l = curlwp;
      
              KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
              lwp_lock(l);
              KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
              KASSERT(l->l_stat == LSONPROC);
              l->l_kpriority = false;
              (void)mi_switch(l);
              KERNEL_LOCK(l->l_biglocks, l);
      }
      
      /*
       * General preemption call.  Puts the current LWP back on its run queue
       * and performs an involuntary context switch.
       */
      void
      preempt(void)
      {
              struct lwp *l = curlwp;
      
              KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
              lwp_lock(l);
              KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
              KASSERT(l->l_stat == LSONPROC);
              l->l_kpriority = false;
              l->l_pflag |= LP_PREEMPTING;
              (void)mi_switch(l);
              KERNEL_LOCK(l->l_biglocks, l);
      }
      
      /*
       * Handle a request made by another agent to preempt the current LWP
       * in-kernel.  Usually called when l_dopreempt may be non-zero.
       *
       * Character addresses for lockstat only.
       */
      static char        in_critical_section;
      static char        kernel_lock_held;
      static char        is_softint;
      static char        cpu_kpreempt_enter_fail;
      
      bool
      kpreempt(uintptr_t where)
      {
              uintptr_t failed;
              lwp_t *l;
              int s, dop, lsflag;
      
              l = curlwp;
              failed = 0;
              while ((dop = l->l_dopreempt) != 0) {
                      if (l->l_stat != LSONPROC) {
                              /*
                               * About to block (or die), let it happen.
                               * Doesn't really count as "preemption has
                               * been blocked", since we're going to
                               * context switch.
                               */
                              l->l_dopreempt = 0;
                              return true;
                      }
                      if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
                              /* Can't preempt idle loop, don't count as failure. */
                              l->l_dopreempt = 0;
                              return true;
                      }
                      if (__predict_false(l->l_nopreempt != 0)) {
                              /* LWP holds preemption disabled, explicitly. */
                              if ((dop & DOPREEMPT_COUNTED) == 0) {
                                      kpreempt_ev_crit.ev_count++;
                              }
                              failed = (uintptr_t)&in_critical_section;
                              break;
                      }
                      if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
                              /* Can't preempt soft interrupts yet. */
                              l->l_dopreempt = 0;
                              failed = (uintptr_t)&is_softint;
                              break;
                      }
                      s = splsched();
                      if (__predict_false(l->l_blcnt != 0 ||
                          curcpu()->ci_biglock_wanted != NULL)) {
                              /* Hold or want kernel_lock, code is not MT safe. */
                              splx(s);
                              if ((dop & DOPREEMPT_COUNTED) == 0) {
                                      kpreempt_ev_klock.ev_count++;
                              }
                              failed = (uintptr_t)&kernel_lock_held;
                              break;
                      }
                      if (__predict_false(!cpu_kpreempt_enter(where, s))) {
                              /*
                               * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
                               * interrupt to retry later.
                               */
                              splx(s);
                              failed = (uintptr_t)&cpu_kpreempt_enter_fail;
                              break;
                      }
                      /* Do it! */
                      if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
                              kpreempt_ev_immed.ev_count++;
                      }
                      lwp_lock(l);
                      mi_switch(l);
                      l->l_nopreempt++;
                      splx(s);
      
                      /* Take care of any MD cleanup. */
                      cpu_kpreempt_exit(where);
                      l->l_nopreempt--;
              }
      
              if (__predict_true(!failed)) {
                      return false;
              }
      
              /* Record preemption failure for reporting via lockstat. */
              atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
              lsflag = 0;
              LOCKSTAT_ENTER(lsflag);
              if (__predict_false(lsflag)) {
                      if (where == 0) {
                              where = (uintptr_t)__builtin_return_address(0);
                      }
                      /* Preemption is on, might recurse, so make it atomic. */
                      if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
                          (void *)where) == NULL) {
                              LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
                              l->l_pfaillock = failed;
                      }
              }
              LOCKSTAT_EXIT(lsflag);
              return true;
      }
      
      /*
       * Return true if preemption is explicitly disabled.
       */
      bool
      kpreempt_disabled(void)
      {
	const lwp_t *l = curlwp;
      
              return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
      }
      
      /*
       * Disable kernel preemption.
       */
      void
      kpreempt_disable(void)
      {
      
	KPREEMPT_DISABLE(curlwp);
      }
      
      /*
       * Reenable kernel preemption.
       */
      void
      kpreempt_enable(void)
      {
      
	KPREEMPT_ENABLE(curlwp);
      }
      
      /*
       * Compute the amount of time during which the current lwp was running.
       *
       * - update l_rtime unless it's an idle lwp.
       */
      
      void
      updatertime(lwp_t *l, const struct bintime *now)
      {
      
              if (__predict_false(l->l_flag & LW_IDLE))
                      return;
      
              /* rtime += now - stime */
              bintime_add(&l->l_rtime, now);
              bintime_sub(&l->l_rtime, &l->l_stime);
      }
      
      /*
 * Select the next LWP to run on the current CPU.
       */
      static inline lwp_t *
      nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
      {
              lwp_t *newl;
      
              /*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP lock.
               */
              newl = sched_nextlwp();
              if (newl != NULL) {
                      sched_dequeue(newl);
                      KASSERT(lwp_locked(newl, spc->spc_mutex));
                      KASSERT(newl->l_cpu == ci);
                      newl->l_stat = LSONPROC;
                      newl->l_pflag |= LP_RUNNING;
                      lwp_setlock(newl, spc->spc_lwplock);
              } else {
                      newl = ci->ci_data.cpu_idlelwp;
                      newl->l_stat = LSONPROC;
                      newl->l_pflag |= LP_RUNNING;
              }
      
              /*
               * Only clear want_resched if there are no pending (slow)
               * software interrupts.
               */
              ci->ci_want_resched = ci->ci_data.cpu_softints;
              spc->spc_flags &= ~SPCF_SWITCHCLEAR;
              spc->spc_curpriority = lwp_eprio(newl);
      
              return newl;
      }
      
      /*
       * The machine independent parts of context switch.
       *
       * Returns 1 if another LWP was actually run.
       */
      int
      mi_switch(lwp_t *l)
      {
              struct cpu_info *ci;
              struct schedstate_percpu *spc;
              struct lwp *newl;
              int retval, oldspl;
              struct bintime bt;
              bool returning;
      
              KASSERT(lwp_locked(l, NULL));
              KASSERT(kpreempt_disabled());
              LOCKDEBUG_BARRIER(l->l_mutex, 1);
      
              kstack_check_magic(l);
      
              binuptime(&bt);
      
              KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
              KASSERT((l->l_pflag & LP_RUNNING) != 0);
              KASSERT(l->l_cpu == curcpu());
              ci = l->l_cpu;
              spc = &ci->ci_schedstate;
              returning = false;
              newl = NULL;
      
              /*
               * If we have been asked to switch to a specific LWP, then there
               * is no need to inspect the run queues.  If a soft interrupt is
               * blocking, then return to the interrupted thread without adjusting
               * VM context or its start time: neither have been changed in order
               * to take the interrupt.
               */
              if (l->l_switchto != NULL) {
                      if ((l->l_pflag & LP_INTR) != 0) {
                              returning = true;
                              softint_block(l);
                              if ((l->l_pflag & LP_TIMEINTR) != 0)
                                      updatertime(l, &bt);
                      }
                      newl = l->l_switchto;
                      l->l_switchto = NULL;
              }
      #ifndef __HAVE_FAST_SOFTINTS
              else if (ci->ci_data.cpu_softints != 0) {
                      /* There are pending soft interrupts, so pick one. */
                      newl = softint_picklwp();
                      newl->l_stat = LSONPROC;
                      newl->l_pflag |= LP_RUNNING;
              }
      #endif        /* !__HAVE_FAST_SOFTINTS */
      
              /* Count time spent in current system call */
              if (!returning) {
                      SYSCALL_TIME_SLEEP(l);
      
                      updatertime(l, &bt);
              }
      
              /* Lock the runqueue */
              KASSERT(l->l_stat != LSRUN);
              mutex_spin_enter(spc->spc_mutex);
      
              /*
               * If on the CPU and we have gotten this far, then we must yield.
               */
              if (l->l_stat == LSONPROC && l != newl) {
                      KASSERT(lwp_locked(l, spc->spc_lwplock));
                      if ((l->l_flag & LW_IDLE) == 0) {
                              l->l_stat = LSRUN;
                              lwp_setlock(l, spc->spc_mutex);
                              sched_enqueue(l, true);
                              /*
                               * Handle migration.  Note that "migrating LWP" may
                               * be reset here, if interrupt/preemption happens
                               * early in idle LWP.
                               */
                              if (l->l_target_cpu != NULL &&
                                  (l->l_pflag & LP_BOUND) == 0) {
                                      KASSERT((l->l_pflag & LP_INTR) == 0);
                                      spc->spc_migrating = l;
                              }
                      } else
                              l->l_stat = LSIDL;
              }
      
              /* Pick new LWP to run. */
              if (newl == NULL) {
                      newl = nextlwp(ci, spc);
              }
      
              /* Items that must be updated with the CPU locked. */
              if (!returning) {
                      /* Update the new LWP's start time. */
                      newl->l_stime = bt;
      
                      /*
                       * ci_curlwp changes when a fast soft interrupt occurs.
                       * We use cpu_onproc to keep track of which kernel or
                       * user thread is running 'underneath' the software
                       * interrupt.  This is important for time accounting,
                       * itimers and forcing user threads to preempt (aston).
                       */
                      ci->ci_data.cpu_onproc = newl;
              }
      
              /*
               * Preemption related tasks.  Must be done with the current
               * CPU locked.
               */
              cpu_did_resched(l);
              l->l_dopreempt = 0;
              if (__predict_false(l->l_pfailaddr != 0)) {
                      LOCKSTAT_FLAG(lsflag);
                      LOCKSTAT_ENTER(lsflag);
                      LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
                      LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
                          1, l->l_pfailtime, l->l_pfailaddr);
                      LOCKSTAT_EXIT(lsflag);
                      l->l_pfailtime = 0;
                      l->l_pfaillock = 0;
                      l->l_pfailaddr = 0;
              }
      
              if (l != newl) {
                      struct lwp *prevlwp;
      
                      /* Release all locks, but leave the current LWP locked */
                      if (l->l_mutex == spc->spc_mutex) {
                              /*
                               * Drop spc_lwplock, if the current LWP has been moved
                               * to the run queue (it is now locked by spc_mutex).
                               */
                              mutex_spin_exit(spc->spc_lwplock);
                      } else {
                              /*
                               * Otherwise, drop the spc_mutex, we are done with the
                               * run queues.
                               */
                              mutex_spin_exit(spc->spc_mutex);
                      }
      
                      /*
                       * Mark that context switch is going to be performed
                       * for this LWP, to protect it from being switched
                       * to on another CPU.
                       */
                      KASSERT(l->l_ctxswtch == 0);
                      l->l_ctxswtch = 1;
                      l->l_ncsw++;
                      if ((l->l_pflag & LP_PREEMPTING) != 0)
                              l->l_nivcsw++;
                      l->l_pflag &= ~LP_PREEMPTING;
                      KASSERT((l->l_pflag & LP_RUNNING) != 0);
                      l->l_pflag &= ~LP_RUNNING;
      
                      /*
                       * Increase the count of spin-mutexes before the release
                       * of the last lock - we must remain at IPL_SCHED during
                       * the context switch.
                       */
                      KASSERTMSG(ci->ci_mtx_count == -1,
                          "%s: cpu%u: ci_mtx_count (%d) != -1 "
                          "(block with spin-mutex held)",
                           __func__, cpu_index(ci), ci->ci_mtx_count);
                      oldspl = MUTEX_SPIN_OLDSPL(ci);
                      ci->ci_mtx_count--;
                      lwp_unlock(l);
      
                      /* Count the context switch on this CPU. */
                      ci->ci_data.cpu_nswtch++;
      
                      /* Update status for lwpctl, if present. */
                      if (l->l_lwpctl != NULL)
                              l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;
      
                      /*
                       * Save old VM context, unless a soft interrupt
                       * handler is blocking.
                       */
                      if (!returning)
                              pmap_deactivate(l);
      
                      /*
                       * We may need to spin-wait if 'newl' is still
                       * context switching on another CPU.
                       */
                      if (__predict_false(newl->l_ctxswtch != 0)) {
                              u_int count;
                              count = SPINLOCK_BACKOFF_MIN;
                              while (newl->l_ctxswtch)
                                      SPINLOCK_BACKOFF(count);
                      }
      
                      /*
                       * If DTrace has set the active vtime enum to anything
                       * other than INACTIVE (0), then it should have set the
                       * function to call.
                       */
                      if (__predict_false(dtrace_vtime_active)) {
                              (*dtrace_vtime_switch_func)(newl);
                      }
      
                      /*
		 * We must not come here from inside a pserialize read section.
                       */
                      KASSERT(pserialize_not_in_read_section());
      
                      /* Switch to the new LWP.. */
      #ifdef MULTIPROCESSOR
                      KASSERT(curlwp == ci->ci_curlwp);
      #endif
                      KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
                      prevlwp = cpu_switchto(l, newl, returning);
                      ci = curcpu();
      #ifdef MULTIPROCESSOR
                      KASSERT(curlwp == ci->ci_curlwp);
      #endif
                      KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p",
                          l, curlwp, prevlwp);
      
                      /*
                       * Switched away - we have new curlwp.
                       * Restore VM context and IPL.
                       */
                      pmap_activate(l);
                      pcu_switchpoint(l);
      
                      if (prevlwp != NULL) {
                              /* Normalize the count of the spin-mutexes */
                              ci->ci_mtx_count++;
                              /* Unmark the state of context switch */
                              membar_exit();
                              prevlwp->l_ctxswtch = 0;
                      }
      
                      /* Update status for lwpctl, if present. */
                      if (l->l_lwpctl != NULL) {
                              l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
                              l->l_lwpctl->lc_pctr++;
                      }
      
                      /* Note trip through cpu_switchto(). */
                      pserialize_switchpoint();
      
                      KASSERT(l->l_cpu == ci);
                      splx(oldspl);
                      /*
                       * note that, unless the caller disabled preemption,
                       * we can be preempted at any time after the above splx() call.
                       */
                      retval = 1;
              } else {
                      /* Nothing to do - just unlock and return. */
                      pserialize_switchpoint();
                      mutex_spin_exit(spc->spc_mutex);
                      l->l_pflag &= ~LP_PREEMPTING;
                      lwp_unlock(l);
                      retval = 0;
              }
      
              KASSERT(l == curlwp);
              KASSERT(l->l_stat == LSONPROC);
      
              SYSCALL_TIME_WAKEUP(l);
              LOCKDEBUG_BARRIER(NULL, 1);
      
              return retval;
      }
      
      /*
       * The machine independent parts of context switch to oblivion.
       * Does not return.  Call with the LWP unlocked.
       */
      void
      lwp_exit_switchaway(lwp_t *l)
      {
              struct cpu_info *ci;
              struct lwp *newl;
              struct bintime bt;
      
              ci = l->l_cpu;
      
              KASSERT(kpreempt_disabled());
              KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
              KASSERT(ci == curcpu());
              LOCKDEBUG_BARRIER(NULL, 0);
      
              kstack_check_magic(l);
      
              /* Count time spent in current system call */
              SYSCALL_TIME_SLEEP(l);
              binuptime(&bt);
              updatertime(l, &bt);
      
              /* Must stay at IPL_SCHED even after releasing run queue lock. */
              (void)splsched();
      
              /*
               * Let sched_nextlwp() select the LWP to run the CPU next.
               * If no LWP is runnable, select the idle LWP.
               * 
	 * Note that spc_lwplock might not necessarily be held, and
	 * that the new thread will be unlocked after setting the LWP-lock.
               */
              spc_lock(ci);
      #ifndef __HAVE_FAST_SOFTINTS
              if (ci->ci_data.cpu_softints != 0) {
                      /* There are pending soft interrupts, so pick one. */
                      newl = softint_picklwp();
                      newl->l_stat = LSONPROC;
                      newl->l_pflag |= LP_RUNNING;
              } else 
      #endif        /* !__HAVE_FAST_SOFTINTS */
              {
                      newl = nextlwp(ci, &ci->ci_schedstate);
              }
      
              /* Update the new LWP's start time. */
              newl->l_stime = bt;
              l->l_pflag &= ~LP_RUNNING;
      
              /*
               * ci_curlwp changes when a fast soft interrupt occurs.
               * We use cpu_onproc to keep track of which kernel or
               * user thread is running 'underneath' the software
               * interrupt.  This is important for time accounting,
               * itimers and forcing user threads to preempt (aston).
               */
              ci->ci_data.cpu_onproc = newl;
      
              /*
               * Preemption related tasks.  Must be done with the current
               * CPU locked.
               */
              cpu_did_resched(l);
      
              /* Unlock the run queue. */
              spc_unlock(ci);
      
              /* Count the context switch on this CPU. */
              ci->ci_data.cpu_nswtch++;
      
              /* Update status for lwpctl, if present. */
              if (l->l_lwpctl != NULL)
                      l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;
      
              /*
               * We may need to spin-wait if 'newl' is still
               * context switching on another CPU.
               */
              if (__predict_false(newl->l_ctxswtch != 0)) {
                      u_int count;
                      count = SPINLOCK_BACKOFF_MIN;
                      while (newl->l_ctxswtch)
                              SPINLOCK_BACKOFF(count);
              }
      
              /*
               * If DTrace has set the active vtime enum to anything
               * other than INACTIVE (0), then it should have set the
               * function to call.
               */
              if (__predict_false(dtrace_vtime_active)) {
                      (*dtrace_vtime_switch_func)(newl);
              }
      
              /* Switch to the new LWP.. */
              (void)cpu_switchto(NULL, newl, false);
      
              for (;;) continue;        /* XXX: convince gcc about "noreturn" */
              /* NOTREACHED */
      }
      
      /*
       * setrunnable: change LWP state to be runnable, placing it on the run queue.
       *
       * Call with the process and LWP locked.  Will return with the LWP unlocked.
       */
      void
      setrunnable(struct lwp *l)
      {
              struct proc *p = l->l_proc;
              struct cpu_info *ci;
      
              KASSERT((l->l_flag & LW_IDLE) == 0);
              KASSERT(mutex_owned(p->p_lock));
              KASSERT(lwp_locked(l, NULL));
              KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);
      
              switch (l->l_stat) {
              case LSSTOP:
                      /*
                       * If we're being traced (possibly because someone attached us
                       * while we were stopped), check for a signal from the debugger.
                       */
                      if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xsig != 0)
                              signotify(l);
                      p->p_nrlwps++;
                      break;
              case LSSUSPENDED:
                      l->l_flag &= ~LW_WSUSPEND;
                      p->p_nrlwps++;
                      cv_broadcast(&p->p_lwpcv);
                      break;
              case LSSLEEP:
                      KASSERT(l->l_wchan != NULL);
                      break;
              default:
                      panic("setrunnable: lwp %p state was %d", l, l->l_stat);
              }
      
              /*
               * If the LWP was sleeping, start it again.
               */
              if (l->l_wchan != NULL) {
                      l->l_stat = LSSLEEP;
                      /* lwp_unsleep() will release the lock. */
                      lwp_unsleep(l, true);
                      return;
              }
      
              /*
               * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
               * about to call mi_switch(), in which case it will yield.
               */
              if ((l->l_pflag & LP_RUNNING) != 0) {
                      l->l_stat = LSONPROC;
                      l->l_slptime = 0;
                      lwp_unlock(l);
                      return;
              }
      
              /*
               * Look for a CPU to run.
               * Set the LWP runnable.
               */
              ci = sched_takecpu(l);
              l->l_cpu = ci;
              spc_lock(ci);
              lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
              sched_setrunnable(l);
              l->l_stat = LSRUN;
              l->l_slptime = 0;
      
              sched_enqueue(l, false);
              resched_cpu(l);
              lwp_unlock(l);
      }
      
      /*
       * suspendsched:
       *
       *        Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED. 
       */
      void
      suspendsched(void)
      {
              CPU_INFO_ITERATOR cii;
              struct cpu_info *ci;
              struct lwp *l;
              struct proc *p;
      
              /*
               * We do this by process in order not to violate the locking rules.
               */
              mutex_enter(proc_lock);
              PROCLIST_FOREACH(p, &allproc) {
                      mutex_enter(p->p_lock);
                      if ((p->p_flag & PK_SYSTEM) != 0) {
                              mutex_exit(p->p_lock);
                              continue;
                      }
      
                      if (p->p_stat != SSTOP) {
                              if (p->p_stat != SZOMB && p->p_stat != SDEAD) {
                                      p->p_pptr->p_nstopchild++;
                                      p->p_waited = 0;
                              }
                              p->p_stat = SSTOP;
                      }
      
                      LIST_FOREACH(l, &p->p_lwps, l_sibling) {
                              if (l == curlwp)
                                      continue;
      
                              lwp_lock(l);
      
                              /*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * get as many LWPs as possible to the user/kernel
			 * boundary, so that they will release any locks
			 * that they hold.
                               */
                              l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
      
                              if (l->l_stat == LSSLEEP &&
                                  (l->l_flag & LW_SINTR) != 0) {
                                      /* setrunnable() will release the lock. */
                                      setrunnable(l);
                                      continue;
                              }
      
                              lwp_unlock(l);
                      }
      
                      mutex_exit(p->p_lock);
              }
              mutex_exit(proc_lock);
      
              /*
               * Kick all CPUs to make them preempt any LWPs running in user mode. 
               * They'll trap into the kernel and suspend themselves in userret().
               */
              for (CPU_INFO_FOREACH(cii, ci)) {
                      spc_lock(ci);
                      cpu_need_resched(ci, RESCHED_IMMED);
                      spc_unlock(ci);
              }
      }
      
      /*
       * sched_unsleep:
       *
 *	This is called when the LWP has not been awoken normally but instead
       *        interrupted: for example, if the sleep timed out.  Because of this,
       *        it's not a valid action for running or idle LWPs.
       */
      static void
      sched_unsleep(struct lwp *l, bool cleanup)
      {
      
              lwp_unlock(l);
              panic("sched_unsleep");
      }
      
      static void
      resched_cpu(struct lwp *l)
      {
              struct cpu_info *ci = l->l_cpu;
      
              KASSERT(lwp_locked(l, NULL));
              if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
                      cpu_need_resched(ci, 0);
      }
      
      static void
      sched_changepri(struct lwp *l, pri_t pri)
      {
      
              KASSERT(lwp_locked(l, NULL));
      
              if (l->l_stat == LSRUN) {
                      KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
                      sched_dequeue(l);
                      l->l_priority = pri;
                      sched_enqueue(l, false);
              } else {
                      l->l_priority = pri;
              }
              resched_cpu(l);
      }
      
      static void
      sched_lendpri(struct lwp *l, pri_t pri)
      {
      
              KASSERT(lwp_locked(l, NULL));
      
              if (l->l_stat == LSRUN) {
                      KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
                      sched_dequeue(l);
                      l->l_inheritedprio = pri;
                      l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
                      sched_enqueue(l, false);
              } else {
                      l->l_inheritedprio = pri;
                      l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
              }
              resched_cpu(l);
      }
      
      struct lwp *
      syncobj_noowner(wchan_t wchan)
      {
      
              return NULL;
      }
      
      /* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
      const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;
      
      /*
       * Constants for averages over 1, 5 and 15 minutes when sampling at
       * 5 second intervals.
       */
      static const fixpt_t cexp[ ] = {
              0.9200444146293232 * FSCALE,        /* exp(-1/12) */
              0.9834714538216174 * FSCALE,        /* exp(-1/60) */
              0.9944598480048967 * FSCALE,        /* exp(-1/180) */
      };
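
/*
 * Editor's note: the constants above encode exponential decay in the
 * kernel's fixpt_t format (values scaled by FSCALE).  sched_pstats()
 * runs once per second, so multiplying by ccpu = exp(-1/20) each tick
 * leaves exp(-3), roughly 5%, of the original p_pctcpu after 60 seconds.
 * The load averages are sampled every 5 seconds, so the 1, 5 and 15
 * minute windows correspond to 12, 60 and 180 samples, hence exp(-1/12),
 * exp(-1/60) and exp(-1/180).  The sketch below is a standalone userland
 * demonstration of the same fixed-point recurrences; it is not part of
 * the kernel build, and the local FSHIFT value of 11 (the traditional
 * BSD definition) is an assumption of the example.
 */
#if 0	/* illustrative sketch only; compile in userland with -lm */
#include <math.h>
#include <stdio.h>

#define EX_FSHIFT	11			/* assumed; kernel uses FSHIFT */
#define EX_FSCALE	(1 << EX_FSHIFT)

typedef unsigned int ex_fixpt_t;

int
main(void)
{
	/* c_pctcpu = exp(-1/20), applied once per simulated second. */
	const ex_fixpt_t c_pctcpu = (ex_fixpt_t)(exp(-1.0 / 20.0) * EX_FSCALE);
	/* c_load1 = exp(-1/12), applied once per simulated 5-second sample. */
	const ex_fixpt_t c_load1 = (ex_fixpt_t)(exp(-1.0 / 12.0) * EX_FSCALE);
	ex_fixpt_t pctcpu = EX_FSCALE;		/* start at "100%" */
	ex_fixpt_t ldavg = 0;
	const int nrun = 2;			/* pretend 2 runnable LWPs */
	int i;

	/* After 60 decays only about exp(-3), roughly 5%, remains. */
	for (i = 0; i < 60; i++)
		pctcpu = (pctcpu * c_pctcpu) >> EX_FSHIFT;
	printf("p_pctcpu remaining after 60s: %.1f%%\n",
	    100.0 * pctcpu / EX_FSCALE);

	/* Same update as the load-average loop in sched_pstats(). */
	for (i = 0; i < 60; i++)		/* five minutes of samples */
		ldavg = (c_load1 * ldavg +
		    nrun * EX_FSCALE * (EX_FSCALE - c_load1)) >> EX_FSHIFT;
	printf("1-minute load converges toward %.2f\n",
	    (double)ldavg / EX_FSCALE);
	return 0;
}
#endif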
      
      /*
       * sched_pstats:
       *
       * => Update process statistics and check CPU resource allocation.
       * => Call scheduler-specific hook to eventually adjust LWP priorities.
 * => Compute the load averages over 1, 5 and 15 minute intervals.
       */
      void
      sched_pstats(void)
      {
              extern struct loadavg averunnable;
              struct loadavg *avg = &averunnable;
              const int clkhz = (stathz != 0 ? stathz : hz);
              static bool backwards = false;
              static u_int lavg_count = 0;
              struct proc *p;
              int nrun;
      
              sched_pstats_ticks++;
              if (++lavg_count >= 5) {
                      lavg_count = 0;
                      nrun = 0;
              }
              mutex_enter(proc_lock);
              PROCLIST_FOREACH(p, &allproc) {
                      struct lwp *l;
                      struct rlimit *rlim;
                      time_t runtm;
                      int sig;
      
                      /* Increment sleep time (if sleeping), ignore overflow. */
                      mutex_enter(p->p_lock);
                      runtm = p->p_rtime.sec;
                      LIST_FOREACH(l, &p->p_lwps, l_sibling) {
                              fixpt_t lpctcpu;
                              u_int lcpticks;
      
                              if (__predict_false((l->l_flag & LW_IDLE) != 0))
                                      continue;
                              lwp_lock(l);
                              runtm += l->l_rtime.sec;
                              l->l_swtime++;
                              sched_lwp_stats(l);
      
                              /* For load average calculation. */
                              if (__predict_false(lavg_count == 0) &&
                                  (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
                                      switch (l->l_stat) {
                                      case LSSLEEP:
                                              if (l->l_slptime > 1) {
                                                      break;
                                              }
                                              /* FALLTHROUGH */
                                      case LSRUN:
                                      case LSONPROC:
                                      case LSIDL:
                                              nrun++;
                                      }
                              }
                              lwp_unlock(l);
      
                              l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
                              if (l->l_slptime != 0)
                                      continue;
      
                              lpctcpu = l->l_pctcpu;
                              lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
                              lpctcpu += ((FSCALE - ccpu) *
                                  (lcpticks * FSCALE / clkhz)) >> FSHIFT;
                              l->l_pctcpu = lpctcpu;
                      }
                      /* Calculating p_pctcpu only for ps(1) */
                      p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
      
                      if (__predict_false(runtm < 0)) {
                              if (!backwards) {
                                      backwards = true;
                                      printf("WARNING: negative runtime; "
                                          "monotonic clock has gone backwards\n");
                              }
                              mutex_exit(p->p_lock);
                              continue;
                      }
      
                      /*
                       * Check if the process exceeds its CPU resource allocation.
                       * If over the hard limit, kill it with SIGKILL.
                       * If over the soft limit, send SIGXCPU and raise
                       * the soft limit a little.
                       */
                      rlim = &p->p_rlimit[RLIMIT_CPU];
                      sig = 0;
                      if (__predict_false(runtm >= rlim->rlim_cur)) {
                              if (runtm >= rlim->rlim_max) {
                                      sig = SIGKILL;
                                      log(LOG_NOTICE,
                                          "pid %d, command %s, is killed: %s\n",
                                          p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
                                      uprintf("pid %d, command %s, is killed: %s\n",
                                          p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
                              } else {
                                      sig = SIGXCPU;
                                      if (rlim->rlim_cur < rlim->rlim_max)
                                              rlim->rlim_cur += 5;
                              }
                      }
                      mutex_exit(p->p_lock);
                      if (__predict_false(sig)) {
                              KASSERT((p->p_flag & PK_SYSTEM) == 0);
                              psignal(p, sig);
                      }
              }
              mutex_exit(proc_lock);
      
              /* Load average calculation. */
              if (__predict_false(lavg_count == 0)) {
                      int i;
                      CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
                      for (i = 0; i < __arraycount(cexp); i++) {
                              avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
                                  nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
                      }
              }
      
              /* Lightning bolt. */
              cv_broadcast(&lbolt);
      }
      /*        $NetBSD: sys_syscall.c,v 1.12 2018/12/02 21:00:13 maxv Exp $        */
      
      /*-
       * Copyright (c) 2006 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by David Laight.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: sys_syscall.c,v 1.12 2018/12/02 21:00:13 maxv Exp $");
      
      #include <sys/syscall_stats.h>
      #include <sys/syscallvar.h>
      
      /*
       * MI indirect system call support.
       * Included from sys_indirect.c and compat/netbsd32/netbsd32_indirect.c
       *
       * SYS_SYSCALL is set to the required function name.
       */
      
      #define CONCAT(a,b) __CONCAT(a,b)
      
      int
      SYS_SYSCALL(struct lwp *l, const struct CONCAT(SYS_SYSCALL, _args) *uap,
          register_t *rval)
      {
              /* {
                      syscallarg(int) code;
                      syscallarg(register_t) args[SYS_MAXSYSARGS];
              } */
              const struct sysent *callp;
	struct proc *p = l->l_proc;
              int code;
              int error;
      #ifdef NETBSD32_SYSCALL
              register_t args64[SYS_MAXSYSARGS];
              int i, narg;
              #define TRACE_ARGS args64
      #else
              #define TRACE_ARGS &SCARG(uap, args[0])
      #endif
      
              callp = p->p_emul->e_sysent;
      
              code = SCARG(uap, code) & (SYS_NSYSENT - 1);
              SYSCALL_COUNT(syscall_counts, code);
              callp += code;
      
              if (__predict_false(callp->sy_flags & SYCALL_INDIRECT))
                      return ENOSYS;
      
	if (__predict_true(!p->p_trace_enabled))
		return sy_call(callp, l, &uap->args, rval);
      
      #ifdef NETBSD32_SYSCALL
              narg = callp->sy_narg;
              for (i = 0; i < narg; i++)
                      args64[i] = SCARG(uap, args[i]);
      #endif
      
              error = trace_enter(code, callp, TRACE_ARGS);
              if (__predict_false(error != 0))
                      return error;
              kleak_fill_stack();
              error = sy_call(callp, l, &uap->args, rval);
              trace_exit(code, callp, &uap->args, rval, error);
              return error;
      
              #undef TRACE_ARGS
      }
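
/*
 * Editor's note: a minimal userland sketch of the indirection handled
 * above.  syscall(2) is itself system call number 0; its first argument
 * is the number of the call to dispatch, which the code above looks up
 * in the emulation's sysent table.  getpid() is used only because it
 * takes no arguments; the sketch is not part of the kernel build.
 */
#if 0	/* illustrative sketch only; compile as a normal userland program */
#include <sys/syscall.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* Equivalent to getpid(), but routed through the indirect syscall. */
	int pid = syscall(SYS_getpid);

	printf("pid via indirect syscall: %d\n", pid);
	return 0;
}
#endif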
      /*        $NetBSD: procfs_vfsops.c,v 1.101 2019/03/30 23:28:30 christos Exp $        */
      
      /*
       * Copyright (c) 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * This code is derived from software contributed to Berkeley by
       * Jan-Simon Pendry.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)procfs_vfsops.c        8.7 (Berkeley) 5/10/95
       */
      
      /*
       * Copyright (c) 1993 Jan-Simon Pendry
       *
       * This code is derived from software contributed to Berkeley by
       * Jan-Simon Pendry.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. All advertising materials mentioning features or use of this software
       *    must display the following acknowledgement:
       *        This product includes software developed by the University of
       *        California, Berkeley and its contributors.
       * 4. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)procfs_vfsops.c        8.7 (Berkeley) 5/10/95
       */
      
      /*
       * procfs VFS interface
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: procfs_vfsops.c,v 1.101 2019/03/30 23:28:30 christos Exp $");
      
      #if defined(_KERNEL_OPT)
      #include "opt_compat_netbsd.h"
      #endif
      
      #include <sys/param.h>
      #include <sys/time.h>
      #include <sys/kernel.h>
      #include <sys/systm.h>
      #include <sys/sysctl.h>
      #include <sys/proc.h>
      #include <sys/buf.h>
      #include <sys/syslog.h>
      #include <sys/mount.h>
      #include <sys/dirent.h>
      #include <sys/signalvar.h>
      #include <sys/vnode.h>
      #include <sys/file.h>
      #include <sys/filedesc.h>
      #include <sys/kauth.h>
      #include <sys/module.h>
      
      #include <miscfs/genfs/genfs.h>
      
      #include <miscfs/procfs/procfs.h>
      
      #include <uvm/uvm_extern.h>                        /* for PAGE_SIZE */
      
      MODULE(MODULE_CLASS_VFS, procfs, "ptrace_common");
      
      VFS_PROTOS(procfs);
      
      static struct sysctllog *procfs_sysctl_log;
      
      static kauth_listener_t procfs_listener;
      
      /*
       * VFS Operations.
       *
       * mount system call
       */
      /* ARGSUSED */
      int
      procfs_mount(
          struct mount *mp,
          const char *path,
          void *data,
          size_t *data_len)
      {
              struct lwp *l = curlwp;
              struct procfsmount *pmnt;
              struct procfs_args *args = data;
              int error;
      
              if (args == NULL)
                      return EINVAL;
      
              if (UIO_MX & (UIO_MX-1)) {
		log(LOG_ERR, "procfs: invalid directory entry size\n");
                      return (EINVAL);
              }
      
              if (mp->mnt_flag & MNT_GETARGS) {
                      if (*data_len < sizeof *args)
                              return EINVAL;
      
                      pmnt = VFSTOPROC(mp);
                      if (pmnt == NULL)
                              return EIO;
                      args->version = PROCFS_ARGSVERSION;
                      args->flags = pmnt->pmnt_flags;
                      *data_len = sizeof *args;
                      return 0;
              }
      
              if (mp->mnt_flag & MNT_UPDATE)
                      return (EOPNOTSUPP);
      
              if (*data_len >= sizeof *args && args->version != PROCFS_ARGSVERSION)
                      return EINVAL;
      
              pmnt = kmem_zalloc(sizeof(struct procfsmount), KM_SLEEP);
      
              mp->mnt_stat.f_namemax = PROCFS_MAXNAMLEN;
              mp->mnt_flag |= MNT_LOCAL;
              mp->mnt_data = pmnt;
              vfs_getnewfsid(mp);
      
              error = set_statvfs_info(path, UIO_USERSPACE, "procfs", UIO_SYSSPACE,
                  mp->mnt_op->vfs_name, mp, l);
              pmnt->pmnt_exechook = exechook_establish(procfs_revoke_vnodes, mp);
              if (*data_len >= sizeof *args)
                      pmnt->pmnt_flags = args->flags;
              else
                      pmnt->pmnt_flags = 0;
      
              mp->mnt_iflag |= IMNT_MPSAFE;
              return error;
      }
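
/*
 * Editor's note: a minimal userland sketch of how mount(2) delivers the
 * procfs_args structure consumed by procfs_mount() above.  The "/proc"
 * mount point and the zero flags word are assumptions of the example
 * (mount_procfs(8) performs the equivalent); the sketch is not part of
 * the kernel build.
 */
#if 0	/* illustrative sketch only; compile as a normal userland program */
#include <sys/param.h>
#include <sys/mount.h>
#include <miscfs/procfs/procfs.h>
#include <err.h>

int
main(void)
{
	struct procfs_args args = {
		.version = PROCFS_ARGSVERSION,
		.flags = 0,
	};

	/* procfs_mount() validates args->version and copies args->flags. */
	if (mount(MOUNT_PROCFS, "/proc", 0, &args, sizeof(args)) == -1)
		err(1, "mount procfs");
	return 0;
}
#endif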
      
      /*
       * unmount system call
       */
      int
      procfs_unmount(struct mount *mp, int mntflags)
      {
              int error;
              int flags = 0;
      
              if (mntflags & MNT_FORCE)
                      flags |= FORCECLOSE;
      
              if ((error = vflush(mp, 0, flags)) != 0)
                      return (error);
      
              exechook_disestablish(VFSTOPROC(mp)->pmnt_exechook);
      
              kmem_free(mp->mnt_data, sizeof(struct procfsmount));
              mp->mnt_data = NULL;
      
              return 0;
      }
      
      int
      procfs_root(struct mount *mp, struct vnode **vpp)
      {
              int error;
      
              error = procfs_allocvp(mp, vpp, 0, PFSroot, -1);
              if (error == 0) {
                      error = vn_lock(*vpp, LK_EXCLUSIVE);
                      if (error != 0) {
                              vrele(*vpp);
                              *vpp = NULL;
                      }
              }
      
              return error;
      }
      
      /* ARGSUSED */
      int
      procfs_start(struct mount *mp, int flags)
      {
      
              return (0);
      }
      
      /*
       * Get file system statistics.
       */
      int
      procfs_statvfs(struct mount *mp, struct statvfs *sbp)
      {
      
              genfs_statvfs(mp, sbp);
      
              sbp->f_bsize = PAGE_SIZE;
              sbp->f_frsize = PAGE_SIZE;
              sbp->f_iosize = PAGE_SIZE;
              sbp->f_blocks = 1;
              sbp->f_files = maxproc;                        /* approx */
              sbp->f_ffree = maxproc - nprocs;        /* approx */
              sbp->f_favail = maxproc - nprocs;        /* approx */
      
              return (0);
      }
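
/*
 * Editor's note: the f_files/f_ffree values above are only the maxproc
 * approximations noted in the comments.  A userland statvfs(2) call on a
 * mounted procfs observes exactly those numbers; the "/proc" path is an
 * assumption of the example and the sketch is not part of the build.
 */
#if 0	/* illustrative sketch only; compile as a normal userland program */
#include <sys/statvfs.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct statvfs sv;

	if (statvfs("/proc", &sv) == -1)
		err(1, "statvfs /proc");
	printf("f_files (approx. maxproc): %ju, f_ffree: %ju\n",
	    (uintmax_t)sv.f_files, (uintmax_t)sv.f_ffree);
	return 0;
}
#endif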
      
      /*ARGSUSED*/
      int
      procfs_sync(
          struct mount *mp,
          int waitfor,
          kauth_cred_t uc)
      {
      
              return (0);
      }
      
      /*ARGSUSED*/
      int
      procfs_vget(struct mount *mp, ino_t ino,
          struct vnode **vpp)
      {
              return (EOPNOTSUPP);
      }
      
      int
      procfs_loadvnode(struct mount *mp, struct vnode *vp,
          const void *key, size_t key_len, const void **new_key)
      {
              int error;
              struct pfskey pfskey;
              struct pfsnode *pfs;
      
              KASSERT(key_len == sizeof(pfskey));
              memcpy(&pfskey, key, key_len);
      
              pfs = kmem_alloc(sizeof(*pfs), KM_SLEEP);
              pfs->pfs_pid = pfskey.pk_pid;
              pfs->pfs_type = pfskey.pk_type;
              pfs->pfs_fd = pfskey.pk_fd;
              pfs->pfs_vnode = vp;
              pfs->pfs_flags = 0;
              pfs->pfs_fileno =
                  PROCFS_FILENO(pfs->pfs_pid, pfs->pfs_type, pfs->pfs_fd);
              vp->v_tag = VT_PROCFS;
              vp->v_op = procfs_vnodeop_p;
              vp->v_data = pfs;
      
              switch (pfs->pfs_type) {
              case PFSroot:        /* /proc = dr-xr-xr-x */
                      vp->v_vflag |= VV_ROOT;
                      /*FALLTHROUGH*/
              case PFSproc:        /* /proc/N = dr-xr-xr-x */
                      pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
                      vp->v_type = VDIR;
                      break;
      
              case PFStask:        /* /proc/N/task = dr-xr-xr-x */
                      if (pfs->pfs_fd == -1) {
                              pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|
                                  S_IROTH|S_IXOTH;
                              vp->v_type = VDIR;
                              break;
                      }
                      /*FALLTHROUGH*/
              case PFScurproc:        /* /proc/curproc = lr-xr-xr-x */
              case PFSself:        /* /proc/self    = lr-xr-xr-x */
              case PFScwd:        /* /proc/N/cwd = lr-xr-xr-x */
              case PFSchroot:        /* /proc/N/chroot = lr-xr-xr-x */
              case PFSexe:        /* /proc/N/exe = lr-xr-xr-x */
                      pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
                      vp->v_type = VLNK;
                      break;
      
              case PFSfd:
                      if (pfs->pfs_fd == -1) {        /* /proc/N/fd = dr-x------ */
                              pfs->pfs_mode = S_IRUSR|S_IXUSR;
                              vp->v_type = VDIR;
                      } else {        /* /proc/N/fd/M = [ps-]rw------- */
                              file_t *fp;
                              vnode_t *vxp;
                              struct proc *p;
      
                              mutex_enter(proc_lock);
                              p = proc_find(pfs->pfs_pid);
                              mutex_exit(proc_lock);
                              if (p == NULL) {
                                      error = ENOENT;
                                      goto bad;
                              }
                              KASSERT(rw_read_held(&p->p_reflock));
                              if ((fp = fd_getfile2(p, pfs->pfs_fd)) == NULL) {
                                      error = EBADF;
                                      goto bad;
                              }
      
                              pfs->pfs_mode = S_IRUSR|S_IWUSR;
                              switch (fp->f_type) {
                              case DTYPE_VNODE:
                                      vxp = fp->f_vnode;
      
                                      /*
                                       * We make symlinks for directories
                                       * to avoid cycles.
                                       */
                                      if (vxp->v_type == VDIR)
                                              goto symlink;
                                      vp->v_type = vxp->v_type;
                                      break;
                              case DTYPE_PIPE:
                                      vp->v_type = VFIFO;
                                      break;
                              case DTYPE_SOCKET:
                                      vp->v_type = VSOCK;
                                      break;
                              case DTYPE_KQUEUE:
                              case DTYPE_MISC:
                              case DTYPE_SEM:
                              symlink:
                                      pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|
                                          S_IXGRP|S_IROTH|S_IXOTH;
                                      vp->v_type = VLNK;
                                      break;
                              default:
                                      error = EOPNOTSUPP;
                                      closef(fp);
                                      goto bad;
                              }
                              closef(fp);
                      }
                      break;
      
              case PFSfile:        /* /proc/N/file = -rw------- */
              case PFSmem:        /* /proc/N/mem = -rw------- */
              case PFSregs:        /* /proc/N/regs = -rw------- */
              case PFSfpregs:        /* /proc/N/fpregs = -rw------- */
                      pfs->pfs_mode = S_IRUSR|S_IWUSR;
                      vp->v_type = VREG;
                      break;
      
              case PFSnote:        /* /proc/N/note = --w------ */
              case PFSnotepg:        /* /proc/N/notepg = --w------ */
                      pfs->pfs_mode = S_IWUSR;
                      vp->v_type = VREG;
                      break;
      
              case PFSmap:                /* /proc/N/map = -r-------- */
              case PFSmaps:                /* /proc/N/maps = -r-------- */
              case PFSauxv:                /* /proc/N/auxv = -r-------- */
                      pfs->pfs_mode = S_IRUSR;
                      vp->v_type = VREG;
                      break;
      
              case PFSstatus:                /* /proc/N/status = -r--r--r-- */
              case PFSstat:                /* /proc/N/stat = -r--r--r-- */
              case PFScmdline:        /* /proc/N/cmdline = -r--r--r-- */
              case PFSenviron:        /* /proc/N/environ = -r--r--r-- */
              case PFSemul:                /* /proc/N/emul = -r--r--r-- */
              case PFSmeminfo:        /* /proc/meminfo = -r--r--r-- */
              case PFScpustat:        /* /proc/stat = -r--r--r-- */
              case PFSdevices:        /* /proc/devices = -r--r--r-- */
              case PFScpuinfo:        /* /proc/cpuinfo = -r--r--r-- */
              case PFSuptime:                /* /proc/uptime = -r--r--r-- */
              case PFSmounts:                /* /proc/mounts = -r--r--r-- */
              case PFSloadavg:        /* /proc/loadavg = -r--r--r-- */
              case PFSstatm:                /* /proc/N/statm = -r--r--r-- */
              case PFSversion:        /* /proc/version = -r--r--r-- */
              case PFSlimit:                /* /proc/limit = -r--r--r-- */
                      pfs->pfs_mode = S_IRUSR|S_IRGRP|S_IROTH;
                      vp->v_type = VREG;
                      break;
      
      #ifdef __HAVE_PROCFS_MACHDEP
              PROCFS_MACHDEP_NODETYPE_CASES
                      procfs_machdep_allocvp(vp);
                      break;
      #endif
      
              default:
                      panic("procfs_allocvp");
              }
      
              uvm_vnp_setsize(vp, 0);
              *new_key = &pfs->pfs_key;
      
              return 0;
      
      bad:
	vp->v_tag = VT_NON;
              vp->v_type = VNON;
              vp->v_op = NULL;
              vp->v_data = NULL;
              kmem_free(pfs, sizeof(*pfs));
              return error;
      }
      
      void
      procfs_init(void)
      {
      
      }
      
      void
      procfs_reinit(void)
      {
      
      }
      
      void
      procfs_done(void)
      {
      
      }
      
      extern const struct vnodeopv_desc procfs_vnodeop_opv_desc;
      
      const struct vnodeopv_desc * const procfs_vnodeopv_descs[] = {
              &procfs_vnodeop_opv_desc,
              NULL,
      };
      
      struct vfsops procfs_vfsops = {
              .vfs_name = MOUNT_PROCFS,
              .vfs_min_mount_data = sizeof (struct procfs_args),
              .vfs_mount = procfs_mount,
              .vfs_start = procfs_start,
              .vfs_unmount = procfs_unmount,
              .vfs_root = procfs_root,
              .vfs_quotactl = (void *)eopnotsupp,
              .vfs_statvfs = procfs_statvfs,
              .vfs_sync = procfs_sync,
              .vfs_vget = procfs_vget,
              .vfs_loadvnode = procfs_loadvnode,
              .vfs_fhtovp = (void *)eopnotsupp,
              .vfs_vptofh = (void *)eopnotsupp,
              .vfs_init = procfs_init,
              .vfs_reinit = procfs_reinit,
              .vfs_done = procfs_done,
              .vfs_snapshot = (void *)eopnotsupp,
              .vfs_extattrctl = vfs_stdextattrctl,
              .vfs_suspendctl = genfs_suspendctl,
              .vfs_renamelock_enter = genfs_renamelock_enter,
              .vfs_renamelock_exit = genfs_renamelock_exit,
              .vfs_fsync = (void *)eopnotsupp,
              .vfs_opv_descs = procfs_vnodeopv_descs
      };
      
      static int
      procfs_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
          void *arg0, void *arg1, void *arg2, void *arg3)
      {
              struct proc *p;
              struct pfsnode *pfs;
              int result;
      
              result = KAUTH_RESULT_DEFER;
              p = arg0;
              pfs = arg1;
      
	if (action != KAUTH_PROCESS_PROCFS)
		return result;
      
              switch (pfs->pfs_type) {
              case PFSregs:
              case PFSfpregs:
              case PFSmem:
                      if (kauth_cred_getuid(cred) != kauth_cred_getuid(p->p_cred) ||
                          ISSET(p->p_flag, PK_SUGID))
                              break;
      
                      /*FALLTHROUGH*/
              default:
                      result = KAUTH_RESULT_ALLOW;
                      break;
              }
      
              return result;
      }
      
      
      static int
      procfs_modcmd(modcmd_t cmd, void *arg)
      {
              int error;
      
              switch (cmd) {
              case MODULE_CMD_INIT:
                      error = vfs_attach(&procfs_vfsops);
                      if (error != 0)
                              break;
                      sysctl_createv(&procfs_sysctl_log, 0, NULL, NULL,
                                     CTLFLAG_PERMANENT,
                                     CTLTYPE_NODE, "procfs",
                                     SYSCTL_DESCR("Process file system"),
                                     NULL, 0, NULL, 0,
                                     CTL_VFS, 12, CTL_EOL);
                      /*
                       * XXX the "12" above could be dynamic, thereby eliminating
                       * one more instance of the "number to vfs" mapping problem,
                       * but "12" is the order as taken from sys/mount.h
                       */
      
                      procfs_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
                          procfs_listener_cb, NULL);
      
                      break;
              case MODULE_CMD_FINI:
                      error = vfs_detach(&procfs_vfsops);
                      if (error != 0)
                              break;
                      sysctl_teardown(&procfs_sysctl_log);
                      kauth_unlisten_scope(procfs_listener);
                      break;
              default:
                      error = ENOTTY;
                      break;
              }
      
              return (error);
      }
      /*        $NetBSD: kern_sig.c,v 1.364 2019/06/21 04:28:12 kamil Exp $        */
      
      /*-
       * Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Copyright (c) 1982, 1986, 1989, 1991, 1993
       *        The Regents of the University of California.  All rights reserved.
       * (c) UNIX System Laboratories, Inc.
       * All or some portions of this file are derived from material licensed
       * to the University of California by American Telephone and Telegraph
       * Co. or Unix System Laboratories, Inc. and are reproduced herein with
       * the permission of UNIX System Laboratories, Inc.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)kern_sig.c        8.14 (Berkeley) 5/14/95
       */
      
      /*
       * Signal subsystem.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_sig.c,v 1.364 2019/06/21 04:28:12 kamil Exp $");
      
      #include "opt_ptrace.h"
      #include "opt_dtrace.h"
      #include "opt_compat_sunos.h"
      #include "opt_compat_netbsd.h"
      #include "opt_compat_netbsd32.h"
      #include "opt_pax.h"
      
      #define        SIGPROP                /* include signal properties table */
      #include <sys/param.h>
      #include <sys/signalvar.h>
      #include <sys/proc.h>
      #include <sys/ptrace.h>
      #include <sys/systm.h>
      #include <sys/wait.h>
      #include <sys/ktrace.h>
      #include <sys/syslog.h>
      #include <sys/filedesc.h>
      #include <sys/file.h>
      #include <sys/pool.h>
      #include <sys/ucontext.h>
      #include <sys/exec.h>
      #include <sys/kauth.h>
      #include <sys/acct.h>
      #include <sys/callout.h>
      #include <sys/atomic.h>
      #include <sys/cpu.h>
      #include <sys/module.h>
      #include <sys/sdt.h>
      
      #ifdef PAX_SEGVGUARD
      #include <sys/pax.h>
      #endif /* PAX_SEGVGUARD */
      
      #include <uvm/uvm_extern.h>
      
      #define        SIGQUEUE_MAX        32
      static pool_cache_t        sigacts_cache        __read_mostly;
      static pool_cache_t        ksiginfo_cache        __read_mostly;
      static callout_t        proc_stop_ch        __cacheline_aligned;
      
      sigset_t                contsigmask        __cacheline_aligned;
      sigset_t                stopsigmask        __cacheline_aligned;
      static sigset_t                vforksigmask        __cacheline_aligned;
      sigset_t                sigcantmask        __cacheline_aligned;
      
      static void        ksiginfo_exechook(struct proc *, void *);
      static void        proc_stop(struct proc *, int);
      static void        proc_stop_done(struct proc *, int);
      static void        proc_stop_callout(void *);
      static int        sigchecktrace(void);
      static int        sigpost(struct lwp *, sig_t, int, int);
      static int        sigput(sigpend_t *, struct proc *, ksiginfo_t *);
      static int        sigunwait(struct proc *, const ksiginfo_t *);
      
      static void        sigacts_poolpage_free(struct pool *, void *);
      static void        *sigacts_poolpage_alloc(struct pool *, int);
      
      void (*sendsig_sigcontext_vec)(const struct ksiginfo *, const sigset_t *);
      int (*coredump_vec)(struct lwp *, const char *) =
          (int (*)(struct lwp *, const char *))enosys;
      
      /*
       * DTrace SDT provider definitions
       */
      SDT_PROVIDER_DECLARE(proc);
      SDT_PROBE_DEFINE3(proc, kernel, , signal__send,
          "struct lwp *",         /* target thread */
          "struct proc *",         /* target process */
          "int");                /* signal */
      SDT_PROBE_DEFINE3(proc, kernel, , signal__discard,
          "struct lwp *",        /* target thread */
          "struct proc *",        /* target process */
          "int");                  /* signal */
      SDT_PROBE_DEFINE3(proc, kernel, , signal__handle,
          "int",                 /* signal */
          "ksiginfo_t *",         /* signal info */
          "void (*)(void)");        /* handler address */
      
      
      static struct pool_allocator sigactspool_allocator = {
              .pa_alloc = sigacts_poolpage_alloc,
              .pa_free = sigacts_poolpage_free
      };
      
      #ifdef DEBUG
      int        kern_logsigexit = 1;
      #else
      int        kern_logsigexit = 0;
      #endif
      
      static const char logcoredump[] =
          "pid %d (%s), uid %d: exited on signal %d (core dumped)\n";
      static const char lognocoredump[] =
          "pid %d (%s), uid %d: exited on signal %d (core not dumped, err = %d)\n";
      
      static kauth_listener_t signal_listener;
      
      static int
      signal_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
          void *arg0, void *arg1, void *arg2, void *arg3)
      {
              struct proc *p;
              int result, signum;
      
              result = KAUTH_RESULT_DEFER;
              p = arg0;
              signum = (int)(unsigned long)arg1;
      
        if (action != KAUTH_PROCESS_SIGNAL)
                return result;
      
              if (kauth_cred_uidmatch(cred, p->p_cred) ||
                  (signum == SIGCONT && (curproc->p_session == p->p_session)))
                      result = KAUTH_RESULT_ALLOW;
      
              return result;
      }
      
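/*
 * sigacts_ctor:
 *
 *        Pool cache constructor for sigacts structures: zero the new object.
 */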
      static int
      sigacts_ctor(void *arg __unused, void *obj, int flags __unused)
      {
              memset(obj, 0, sizeof(struct sigacts));
              return 0;
      }
      
      /*
       * signal_init:
       *
       *        Initialize global signal-related data structures.
       */
      void
      signal_init(void)
      {
      
              sigactspool_allocator.pa_pagesz = (PAGE_SIZE)*2;
      
              sigacts_cache = pool_cache_init(sizeof(struct sigacts), 0, 0, 0,
                  "sigacts", sizeof(struct sigacts) > PAGE_SIZE ?
                  &sigactspool_allocator : NULL, IPL_NONE, sigacts_ctor, NULL, NULL);
              ksiginfo_cache = pool_cache_init(sizeof(ksiginfo_t), 0, 0, 0,
                  "ksiginfo", NULL, IPL_VM, NULL, NULL, NULL);
      
              exechook_establish(ksiginfo_exechook, NULL);
      
              callout_init(&proc_stop_ch, CALLOUT_MPSAFE);
              callout_setfunc(&proc_stop_ch, proc_stop_callout, NULL);
      
              signal_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
                  signal_listener_cb, NULL);
      }
      
      /*
       * sigacts_poolpage_alloc:
       *
       *        Allocate a page for the sigacts memory pool.
       */
      static void *
      sigacts_poolpage_alloc(struct pool *pp, int flags)
      {
      
              return (void *)uvm_km_alloc(kernel_map,
                  PAGE_SIZE * 2, PAGE_SIZE * 2,
                  ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
                  | UVM_KMF_WIRED);
      }
      
      /*
       * sigacts_poolpage_free:
       *
       *        Free a page on behalf of the sigacts memory pool.
       */
      static void
      sigacts_poolpage_free(struct pool *pp, void *v)
      {
      
              uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * 2, UVM_KMF_WIRED);
      }
      
      /*
       * sigactsinit:
       *
 *        Create an initial sigacts structure, using the same signal state
 *        as the specified process.  If 'share' is set, share the sigacts by
 *        holding a reference; otherwise just copy it from the parent.
       */
      struct sigacts *
      sigactsinit(struct proc *pp, int share)
      {
              struct sigacts *ps = pp->p_sigacts, *ps2;
      
              if (__predict_false(share)) {
                      atomic_inc_uint(&ps->sa_refcnt);
                      return ps;
              }
              ps2 = pool_cache_get(sigacts_cache, PR_WAITOK);
              mutex_init(&ps2->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
              ps2->sa_refcnt = 1;
      
              mutex_enter(&ps->sa_mutex);
              memcpy(ps2->sa_sigdesc, ps->sa_sigdesc, sizeof(ps2->sa_sigdesc));
              mutex_exit(&ps->sa_mutex);
              return ps2;
      }
      
      /*
       * sigactsunshare:
       *
       *        Make this process not share its sigacts, maintaining all signal state.
       */
      void
      sigactsunshare(struct proc *p)
      {
              struct sigacts *ps, *oldps = p->p_sigacts;
      
              if (__predict_true(oldps->sa_refcnt == 1))
                      return;
      
              ps = pool_cache_get(sigacts_cache, PR_WAITOK);
              mutex_init(&ps->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
              memcpy(ps->sa_sigdesc, oldps->sa_sigdesc, sizeof(ps->sa_sigdesc));
              ps->sa_refcnt = 1;
      
              p->p_sigacts = ps;
              sigactsfree(oldps);
      }
      
      /*
 * sigactsfree:
       *
       *        Release a sigacts structure.
       */
      void
      sigactsfree(struct sigacts *ps)
      {
      
              if (atomic_dec_uint_nv(&ps->sa_refcnt) == 0) {
                      mutex_destroy(&ps->sa_mutex);
                      pool_cache_put(sigacts_cache, ps);
              }
      }
      
      /*
       * siginit:
       *
       *        Initialize signal state for process 0; set to ignore signals that
       *        are ignored by default and disable the signal stack.  Locking not
       *        required as the system is still cold.
       */
      void
      siginit(struct proc *p)
      {
              struct lwp *l;
              struct sigacts *ps;
              int signo, prop;
      
              ps = p->p_sigacts;
              sigemptyset(&contsigmask);
              sigemptyset(&stopsigmask);
              sigemptyset(&vforksigmask);
              sigemptyset(&sigcantmask);
              for (signo = 1; signo < NSIG; signo++) {
                      prop = sigprop[signo];
                      if (prop & SA_CONT)
                              sigaddset(&contsigmask, signo);
                      if (prop & SA_STOP)
                              sigaddset(&stopsigmask, signo);
                      if (prop & SA_STOP && signo != SIGSTOP)
                              sigaddset(&vforksigmask, signo);
                      if (prop & SA_CANTMASK)
                              sigaddset(&sigcantmask, signo);
                      if (prop & SA_IGNORE && signo != SIGCONT)
                              sigaddset(&p->p_sigctx.ps_sigignore, signo);
                      sigemptyset(&SIGACTION_PS(ps, signo).sa_mask);
                      SIGACTION_PS(ps, signo).sa_flags = SA_RESTART;
              }
              sigemptyset(&p->p_sigctx.ps_sigcatch);
              p->p_sflag &= ~PS_NOCLDSTOP;
      
              ksiginfo_queue_init(&p->p_sigpend.sp_info);
              sigemptyset(&p->p_sigpend.sp_set);
      
              /*
               * Reset per LWP state.
               */
              l = LIST_FIRST(&p->p_lwps);
              l->l_sigwaited = NULL;
              l->l_sigstk = SS_INIT;
              ksiginfo_queue_init(&l->l_sigpend.sp_info);
              sigemptyset(&l->l_sigpend.sp_set);
      
              /* One reference. */
              ps->sa_refcnt = 1;
      }
      
      /*
       * execsigs:
       *
       *        Reset signals for an exec of the specified process.
       */
      void
      execsigs(struct proc *p)
      {
              struct sigacts *ps;
              struct lwp *l;
              int signo, prop;
              sigset_t tset;
              ksiginfoq_t kq;
      
              KASSERT(p->p_nlwps == 1);
      
              sigactsunshare(p);
              ps = p->p_sigacts;
      
              /*
               * Reset caught signals.  Held signals remain held through
               * l->l_sigmask (unless they were caught, and are now ignored
               * by default).
               *
               * No need to lock yet, the process has only one LWP and
               * at this point the sigacts are private to the process.
               */
              sigemptyset(&tset);
              for (signo = 1; signo < NSIG; signo++) {
                      if (sigismember(&p->p_sigctx.ps_sigcatch, signo)) {
                              prop = sigprop[signo];
                              if (prop & SA_IGNORE) {
                                      if ((prop & SA_CONT) == 0)
                                              sigaddset(&p->p_sigctx.ps_sigignore,
                                                  signo);
                                      sigaddset(&tset, signo);
                              }
                              SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
                      }
                      sigemptyset(&SIGACTION_PS(ps, signo).sa_mask);
                      SIGACTION_PS(ps, signo).sa_flags = SA_RESTART;
              }
              ksiginfo_queue_init(&kq);
      
              mutex_enter(p->p_lock);
              sigclearall(p, &tset, &kq);
              sigemptyset(&p->p_sigctx.ps_sigcatch);
      
              /*
         * Reset the "no zombies if child dies" flag, as Solaris does.
               */
              p->p_flag &= ~(PK_NOCLDWAIT | PK_CLDSIGIGN);
              if (SIGACTION_PS(ps, SIGCHLD).sa_handler == SIG_IGN)
                      SIGACTION_PS(ps, SIGCHLD).sa_handler = SIG_DFL;
      
              /*
               * Reset per-LWP state.
               */
              l = LIST_FIRST(&p->p_lwps);
              l->l_sigwaited = NULL;
              l->l_sigstk = SS_INIT;
              ksiginfo_queue_init(&l->l_sigpend.sp_info);
              sigemptyset(&l->l_sigpend.sp_set);
              mutex_exit(p->p_lock);
      
              ksiginfo_queue_drain(&kq);
      }
      
      /*
       * ksiginfo_exechook:
       *
       *        Free all pending ksiginfo entries from a process on exec.
       *        Additionally, drain any unused ksiginfo structures in the
       *        system back to the pool.
       *
       *        XXX This should not be a hook, every process has signals.
       */
      static void
      ksiginfo_exechook(struct proc *p, void *v)
      {
              ksiginfoq_t kq;
      
              ksiginfo_queue_init(&kq);
      
              mutex_enter(p->p_lock);
              sigclearall(p, NULL, &kq);
              mutex_exit(p->p_lock);
      
              ksiginfo_queue_drain(&kq);
      }
      
      /*
       * ksiginfo_alloc:
       *
       *        Allocate a new ksiginfo structure from the pool, and optionally copy
       *        an existing one.  If the existing ksiginfo_t is from the pool, and
       *        has not been queued somewhere, then just return it.  Additionally,
       *        if the existing ksiginfo_t does not contain any information beyond
       *        the signal number, then just return it.
       */
      ksiginfo_t *
      ksiginfo_alloc(struct proc *p, ksiginfo_t *ok, int flags)
      {
              ksiginfo_t *kp;
      
              if (ok != NULL) {
                      if ((ok->ksi_flags & (KSI_QUEUED | KSI_FROMPOOL)) ==
                          KSI_FROMPOOL)
                              return ok;
                      if (KSI_EMPTY_P(ok))
                              return ok;
              }
      
              kp = pool_cache_get(ksiginfo_cache, flags);
              if (kp == NULL) {
      #ifdef DIAGNOSTIC
                      printf("Out of memory allocating ksiginfo for pid %d\n",
                          p->p_pid);
      #endif
                      return NULL;
              }
      
              if (ok != NULL) {
                      memcpy(kp, ok, sizeof(*kp));
                      kp->ksi_flags &= ~KSI_QUEUED;
              } else
                      KSI_INIT_EMPTY(kp);
      
              kp->ksi_flags |= KSI_FROMPOOL;
      
              return kp;
      }
      
      /*
       * ksiginfo_free:
       *
       *        If the given ksiginfo_t is from the pool and has not been queued,
       *        then free it.
       */
      void
      ksiginfo_free(ksiginfo_t *kp)
      {
      
              if ((kp->ksi_flags & (KSI_QUEUED | KSI_FROMPOOL)) != KSI_FROMPOOL)
                      return;
              pool_cache_put(ksiginfo_cache, kp);
      }
      
      /*
       * ksiginfo_queue_drain:
       *
       *        Drain a non-empty ksiginfo_t queue.
       */
      void
      ksiginfo_queue_drain0(ksiginfoq_t *kq)
      {
              ksiginfo_t *ksi;
      
              KASSERT(!TAILQ_EMPTY(kq));
      
              while (!TAILQ_EMPTY(kq)) {
                      ksi = TAILQ_FIRST(kq);
                      TAILQ_REMOVE(kq, ksi, ksi_list);
                      pool_cache_put(ksiginfo_cache, ksi);
              }
      }
      
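/*
 * siggetinfo:
 *
 *        Remove the first queued ksiginfo matching 'signo' from the pending
 *        queue, copying it to 'out' if requested.  If nothing is queued,
 *        manufacture a bare siginfo with SI_NOINFO.  Returns the number of
 *        queued entries that matched, so the caller can tell whether more
 *        instances of the signal remain pending.
 */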
      static int
      siggetinfo(sigpend_t *sp, ksiginfo_t *out, int signo)
      {
              ksiginfo_t *ksi, *nksi;
      
              if (sp == NULL)
                      goto out;
      
              /* Find siginfo and copy it out. */
              int count = 0;
              TAILQ_FOREACH_SAFE(ksi, &sp->sp_info, ksi_list, nksi) {
                      if (ksi->ksi_signo != signo)
                              continue;
                if (count++ > 0) /* Only remove the first; count all of them. */
                        continue;
                      TAILQ_REMOVE(&sp->sp_info, ksi, ksi_list);
                      KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
                      KASSERT((ksi->ksi_flags & KSI_QUEUED) != 0);
                      ksi->ksi_flags &= ~KSI_QUEUED;
                      if (out != NULL) {
                              memcpy(out, ksi, sizeof(*out));
                              out->ksi_flags &= ~(KSI_FROMPOOL | KSI_QUEUED);
                      }
                      ksiginfo_free(ksi);
              }
              if (count)
                      return count;
      
      out:
              /* If there is no siginfo, then manufacture it. */
              if (out != NULL) {
                      KSI_INIT(out);
                      out->ksi_info._signo = signo;
                      out->ksi_info._code = SI_NOINFO;
              }
              return 0;
      }
      
      /*
       * sigget:
       *
       *        Fetch the first pending signal from a set.  Optionally, also fetch
       *        or manufacture a ksiginfo element.  Returns the number of the first
       *        pending signal, or zero.
       */ 
      int
      sigget(sigpend_t *sp, ksiginfo_t *out, int signo, const sigset_t *mask)
      {
              sigset_t tset;
              int count;
      
              /* If there's no pending set, the signal is from the debugger. */
              if (sp == NULL)
                      goto out;
      
              /* Construct mask from signo, and 'mask'. */
              if (signo == 0) {
                      if (mask != NULL) {
                              tset = *mask;
                              __sigandset(&sp->sp_set, &tset);
                      } else
                              tset = sp->sp_set;
      
                      /* If there are no signals pending - return. */
                      if ((signo = firstsig(&tset)) == 0)
                              goto out;
              } else {
                      KASSERT(sigismember(&sp->sp_set, signo));
              }
      
              sigdelset(&sp->sp_set, signo);
      out:
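        /*
         * Fetch or manufacture the siginfo.  If more than one ksiginfo
         * was queued for this signal, keep the signal marked as pending.
         */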
              count = siggetinfo(sp, out, signo);
              if (count > 1)
                      sigaddset(&sp->sp_set, signo);
              return signo;
      }
      
      /*
       * sigput:
       *
       *        Append a new ksiginfo element to the list of pending ksiginfo's.
       */
      static int
      sigput(sigpend_t *sp, struct proc *p, ksiginfo_t *ksi)
      {
              ksiginfo_t *kp;
      
              KASSERT(mutex_owned(p->p_lock));
              KASSERT((ksi->ksi_flags & KSI_QUEUED) == 0);
      
              sigaddset(&sp->sp_set, ksi->ksi_signo);
      
              /*
               * If there is no siginfo, we are done.
               */
              if (KSI_EMPTY_P(ksi))
                      return 0;
      
              KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
      
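        /*
         * POSIX realtime signals are always queued individually; other
         * signals coalesce into an already-queued ksiginfo carrying the
         * same signal number.
         */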
              size_t count = 0;
              TAILQ_FOREACH(kp, &sp->sp_info, ksi_list) {
                      count++;
                      if (ksi->ksi_signo >= SIGRTMIN && ksi->ksi_signo <= SIGRTMAX)
                              continue;
                      if (kp->ksi_signo == ksi->ksi_signo) {
                              KSI_COPY(ksi, kp);
                              kp->ksi_flags |= KSI_QUEUED;
                              return 0;
                      }
              }
              
              if (count >= SIGQUEUE_MAX) {
      #ifdef DIAGNOSTIC
                      printf("%s(%d): Signal queue is full signal=%d\n",
                          p->p_comm, p->p_pid, ksi->ksi_signo);
      #endif
                      return EAGAIN;
              }
              ksi->ksi_flags |= KSI_QUEUED;
              TAILQ_INSERT_TAIL(&sp->sp_info, ksi, ksi_list);
              
              return 0;
      }
      
      /*
       * sigclear:
       *
       *        Clear all pending signals in the specified set.
       */
      void
      sigclear(sigpend_t *sp, const sigset_t *mask, ksiginfoq_t *kq)
      {
              ksiginfo_t *ksi, *next;
      
              if (mask == NULL)
                      sigemptyset(&sp->sp_set);
              else
                      sigminusset(mask, &sp->sp_set);
      
              TAILQ_FOREACH_SAFE(ksi, &sp->sp_info, ksi_list, next) {
                      if (mask == NULL || sigismember(mask, ksi->ksi_signo)) {
                              TAILQ_REMOVE(&sp->sp_info, ksi, ksi_list);
                              KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
                              KASSERT((ksi->ksi_flags & KSI_QUEUED) != 0);
                              TAILQ_INSERT_TAIL(kq, ksi, ksi_list);
                      }
              }
      }
      
      /*
       * sigclearall:
       *
       *        Clear all pending signals in the specified set from a process and
       *        its LWPs.
       */
      void
      sigclearall(struct proc *p, const sigset_t *mask, ksiginfoq_t *kq)
      {
              struct lwp *l;
      
              KASSERT(mutex_owned(p->p_lock));
      
              sigclear(&p->p_sigpend, mask, kq);
      
              LIST_FOREACH(l, &p->p_lwps, l_sibling) {
                      sigclear(&l->l_sigpend, mask, kq);
              }
      }
      
      /*
       * sigispending:
       *
       *        Return the first signal number if there are pending signals for the
       *        current LWP.  May be called unlocked provided that LW_PENDSIG is set,
 *        and that the signal has been posted to the appropriate queue before
       *        LW_PENDSIG is set.
       */ 
      int
      sigispending(struct lwp *l, int signo)
      {
              struct proc *p = l->l_proc;
              sigset_t tset;
      
              membar_consumer();
      
              tset = l->l_sigpend.sp_set;
              sigplusset(&p->p_sigpend.sp_set, &tset);
              sigminusset(&p->p_sigctx.ps_sigignore, &tset);
              sigminusset(&l->l_sigmask, &tset);
      
              if (signo == 0) {
                      return firstsig(&tset);
              }
              return sigismember(&tset, signo) ? signo : 0;
      }
      
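/*
 * Fetch the user context (signal mask, stack and machine context) for
 * an LWP.  Called with p_lock held; the lock is dropped around
 * cpu_getmcontext() and re-acquired before returning.
 */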
      void
      getucontext(struct lwp *l, ucontext_t *ucp)
      {
              struct proc *p = l->l_proc;
      
              KASSERT(mutex_owned(p->p_lock));
      
              ucp->uc_flags = 0;
              ucp->uc_link = l->l_ctxlink;
              ucp->uc_sigmask = l->l_sigmask;
              ucp->uc_flags |= _UC_SIGMASK;
      
              /*
               * The (unsupplied) definition of the `current execution stack'
               * in the System V Interface Definition appears to allow returning
               * the main context stack.
               */
              if ((l->l_sigstk.ss_flags & SS_ONSTACK) == 0) {
                      ucp->uc_stack.ss_sp = (void *)l->l_proc->p_stackbase;
                      ucp->uc_stack.ss_size = ctob(l->l_proc->p_vmspace->vm_ssize);
                      ucp->uc_stack.ss_flags = 0;        /* XXX, def. is Very Fishy */
              } else {
                      /* Simply copy alternate signal execution stack. */
                      ucp->uc_stack = l->l_sigstk;
              }
              ucp->uc_flags |= _UC_STACK;
              mutex_exit(p->p_lock);
              cpu_getmcontext(l, &ucp->uc_mcontext, &ucp->uc_flags);
              mutex_enter(p->p_lock);
      }
      
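/*
 * Install a user context for an LWP: signal mask, machine context,
 * context link and alternate signal stack state.  Called with p_lock
 * held; the lock is dropped around cpu_setmcontext().
 */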
      int
      setucontext(struct lwp *l, const ucontext_t *ucp)
      {
              struct proc *p = l->l_proc;
              int error;
      
              KASSERT(mutex_owned(p->p_lock));
      
              if ((ucp->uc_flags & _UC_SIGMASK) != 0) {
                      error = sigprocmask1(l, SIG_SETMASK, &ucp->uc_sigmask, NULL);
                      if (error != 0)
                              return error;
              }
      
              mutex_exit(p->p_lock);
              error = cpu_setmcontext(l, &ucp->uc_mcontext, ucp->uc_flags);
              mutex_enter(p->p_lock);
              if (error != 0)
                        return error;
      
              l->l_ctxlink = ucp->uc_link;
      
              /*
               * If there was stack information, update whether or not we are
               * still running on an alternate signal stack.
               */
              if ((ucp->uc_flags & _UC_STACK) != 0) {
                      if (ucp->uc_stack.ss_flags & SS_ONSTACK)
                              l->l_sigstk.ss_flags |= SS_ONSTACK;
                      else
                              l->l_sigstk.ss_flags &= ~SS_ONSTACK;
              }
      
              return 0;
      }
      
      /*
       * killpg1: common code for kill process group/broadcast kill.
       */
      int
      killpg1(struct lwp *l, ksiginfo_t *ksi, int pgid, int all)
      {
              struct proc        *p, *cp;
              kauth_cred_t        pc;
              struct pgrp        *pgrp;
              int                nfound;
              int                signo = ksi->ksi_signo;
      
              cp = l->l_proc;
              pc = l->l_cred;
              nfound = 0;
      
              mutex_enter(proc_lock);
              if (all) {
                      /*
                       * Broadcast.
                       */
                      PROCLIST_FOREACH(p, &allproc) {
                              if (p->p_pid <= 1 || p == cp ||
                                  (p->p_flag & PK_SYSTEM) != 0)
                                      continue;
                              mutex_enter(p->p_lock);
                              if (kauth_authorize_process(pc,
                                  KAUTH_PROCESS_SIGNAL, p, KAUTH_ARG(signo), NULL,
                                  NULL) == 0) {
                                      nfound++;
                                      if (signo)
                                              kpsignal2(p, ksi);
                              }
                              mutex_exit(p->p_lock);
                      }
              } else {
                      if (pgid == 0)
                              /* Zero pgid means send to my process group. */
                              pgrp = cp->p_pgrp;
                      else {
                              pgrp = pgrp_find(pgid);
                              if (pgrp == NULL)
                                      goto out;
                      }
                      LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
                              if (p->p_pid <= 1 || p->p_flag & PK_SYSTEM)
                                      continue;
                              mutex_enter(p->p_lock);
                              if (kauth_authorize_process(pc, KAUTH_PROCESS_SIGNAL,
                                  p, KAUTH_ARG(signo), NULL, NULL) == 0) {
                                      nfound++;
                                      if (signo && P_ZOMBIE(p) == 0)
                                              kpsignal2(p, ksi);
                              }
                              mutex_exit(p->p_lock);
                      }
              }
      out:
              mutex_exit(proc_lock);
              return nfound ? 0 : ESRCH;
      }
      
      /*
 * Send a signal to a process group.  If checkctty is set, limit to members
       * which have a controlling terminal.
       */
      void
      pgsignal(struct pgrp *pgrp, int sig, int checkctty)
      {
              ksiginfo_t ksi;
      
              KASSERT(!cpu_intr_p());
              KASSERT(mutex_owned(proc_lock));
      
              KSI_INIT_EMPTY(&ksi);
              ksi.ksi_signo = sig;
              kpgsignal(pgrp, &ksi, NULL, checkctty);
      }
      
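/*
 * As above, but deliver a fully constructed ksiginfo.  'data' is passed
 * through to kpsignal() to identify the file descriptor associated with
 * the signal.
 */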
      void
      kpgsignal(struct pgrp *pgrp, ksiginfo_t *ksi, void *data, int checkctty)
      {
              struct proc *p;
      
              KASSERT(!cpu_intr_p());
              KASSERT(mutex_owned(proc_lock));
              KASSERT(pgrp != NULL);
      
              LIST_FOREACH(p, &pgrp->pg_members, p_pglist)
                      if (checkctty == 0 || p->p_lflag & PL_CONTROLT)
                              kpsignal(p, ksi, data);
      }
      
      /*
       * Send a signal caused by a trap to the current LWP.  If it will be caught
       * immediately, deliver it with correct code.  Otherwise, post it normally.
       */
      void
      trapsignal(struct lwp *l, ksiginfo_t *ksi)
      {
              struct proc        *p;
              struct sigacts        *ps;
              int signo = ksi->ksi_signo;
              sigset_t *mask;
              sig_t action;
      
              KASSERT(KSI_TRAP_P(ksi));
      
              ksi->ksi_lid = l->l_lid;
              p = l->l_proc;
      
              KASSERT(!cpu_intr_p());
              mutex_enter(proc_lock);
              mutex_enter(p->p_lock);
      
              /*
               * If we are exiting, demise now.
               *
               * This avoids notifying tracer and deadlocking.
               */
              if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
                      mutex_exit(p->p_lock);
                      mutex_exit(proc_lock);
                      lwp_exit(l);
                      panic("trapsignal");
                      /* NOTREACHED */
              }
      
              mask = &l->l_sigmask;
              ps = p->p_sigacts;
              action = SIGACTION_PS(ps, signo).sa_handler;
      
              if (ISSET(p->p_slflag, PSL_TRACED) &&
                  !(p->p_pptr == p->p_opptr && ISSET(p->p_lflag, PL_PPWAIT)) &&
                  p->p_xsig != SIGKILL &&
                  !sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
                      p->p_xsig = signo;
                      p->p_sigctx.ps_faked = true;
                      p->p_sigctx.ps_lwp = ksi->ksi_lid;
                      p->p_sigctx.ps_info = ksi->ksi_info;
                      sigswitch(0, signo, false);
      
                      if (ktrpoint(KTR_PSIG)) {
                              if (p->p_emul->e_ktrpsig)
                                      p->p_emul->e_ktrpsig(signo, action, mask, ksi);
                              else
                                      ktrpsig(signo, action, mask, ksi);
                      }
                      return;
              }
      
              const bool caught = sigismember(&p->p_sigctx.ps_sigcatch, signo);
              const bool masked = sigismember(mask, signo);
              if (caught && !masked) {
                      mutex_exit(proc_lock);
                      l->l_ru.ru_nsignals++;
                      kpsendsig(l, ksi, mask);
                      mutex_exit(p->p_lock);
      
                      if (ktrpoint(KTR_PSIG)) {
                              if (p->p_emul->e_ktrpsig)
                                      p->p_emul->e_ktrpsig(signo, action, mask, ksi);
                              else
                                      ktrpsig(signo, action, mask, ksi);
                      }
                      return;
              }
      
              /*
               * If the signal is masked or ignored, then unmask it and
               * reset it to the default action so that the process or
               * its tracer will be notified.
               */
              const bool ignored = action == SIG_IGN;
              if (masked || ignored) {
                      mutex_enter(&ps->sa_mutex);
                      sigdelset(mask, signo);        
                      sigdelset(&p->p_sigctx.ps_sigcatch, signo);
                      sigdelset(&p->p_sigctx.ps_sigignore, signo);
                      sigdelset(&SIGACTION_PS(ps, signo).sa_mask, signo);
                      SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
                      mutex_exit(&ps->sa_mutex);
              }
      
              kpsignal2(p, ksi);
              mutex_exit(p->p_lock);
              mutex_exit(proc_lock);
      }
      
      /*
       * Fill in signal information and signal the parent for a child status change.
       */
      void
      child_psignal(struct proc *p, int mask)
      {
              ksiginfo_t ksi;
              struct proc *q;
              int xsig;
      
              KASSERT(mutex_owned(proc_lock));
              KASSERT(mutex_owned(p->p_lock));
      
              xsig = p->p_xsig;
      
              KSI_INIT(&ksi);
              ksi.ksi_signo = SIGCHLD;
              ksi.ksi_code = (xsig == SIGCONT ? CLD_CONTINUED : CLD_STOPPED);
              ksi.ksi_pid = p->p_pid;
              ksi.ksi_uid = kauth_cred_geteuid(p->p_cred);
              ksi.ksi_status = xsig;
              ksi.ksi_utime = p->p_stats->p_ru.ru_utime.tv_sec;
              ksi.ksi_stime = p->p_stats->p_ru.ru_stime.tv_sec;
      
              q = p->p_pptr;
      
              mutex_exit(p->p_lock);
              mutex_enter(q->p_lock);
      
              if ((q->p_sflag & mask) == 0)
                      kpsignal2(q, &ksi);
      
              mutex_exit(q->p_lock);
              mutex_enter(p->p_lock);
      }
      
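/*
 * Send the signal 'signo' to process p.  Must be called with proc_lock
 * held and never from interrupt context.
 */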
      void
      psignal(struct proc *p, int signo)
      {
              ksiginfo_t ksi;
      
              KASSERT(!cpu_intr_p());
              KASSERT(mutex_owned(proc_lock));
      
              KSI_INIT_EMPTY(&ksi);
              ksi.ksi_signo = signo;
              mutex_enter(p->p_lock);
              kpsignal2(p, &ksi);
              mutex_exit(p->p_lock);
      }
      
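/*
 * Send a signal described by 'ksi' to process p.  If 'data' is supplied
 * and the process is not exiting, record in ksi_fd the descriptor whose
 * open file has 'data' as its f_data.
 */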
      void
      kpsignal(struct proc *p, ksiginfo_t *ksi, void *data)
      {
              fdfile_t *ff;
              file_t *fp;
              fdtab_t *dt;
      
              KASSERT(!cpu_intr_p());
              KASSERT(mutex_owned(proc_lock));
      
              if ((p->p_sflag & PS_WEXIT) == 0 && data) {
                      size_t fd;
                      filedesc_t *fdp = p->p_fd;
      
                      /* XXXSMP locking */
                      ksi->ksi_fd = -1;
                      dt = fdp->fd_dt;
                      for (fd = 0; fd < dt->dt_nfiles; fd++) {
                              if ((ff = dt->dt_ff[fd]) == NULL)
                                      continue;
                              if ((fp = ff->ff_file) == NULL)
                                      continue;
                              if (fp->f_data == data) {
                                      ksi->ksi_fd = fd;
                                      break;
                              }
                      }
              }
              mutex_enter(p->p_lock);
              kpsignal2(p, ksi);
              mutex_exit(p->p_lock);
      }
      
      /*
       * sigismasked:
       *
       *        Returns true if signal is ignored or masked for the specified LWP.
       */
      int
      sigismasked(struct lwp *l, int sig)
      {
              struct proc *p = l->l_proc;
      
              return sigismember(&p->p_sigctx.ps_sigignore, sig) ||
                  sigismember(&l->l_sigmask, sig);
      }
      
      /*
       * sigpost:
       *
       *        Post a pending signal to an LWP.  Returns non-zero if the LWP may
       *        be able to take the signal.
       */
      static int
      sigpost(struct lwp *l, sig_t action, int prop, int sig)
      {
              int rv, masked;
              struct proc *p = l->l_proc;
      
              KASSERT(mutex_owned(p->p_lock));
      
              /*
               * If the LWP is on the way out, sigclear() will be busy draining all
               * pending signals.  Don't give it more.
               */
              if (l->l_refcnt == 0)
                      return 0;
      
              SDT_PROBE(proc, kernel, , signal__send, l, p, sig, 0, 0);
      
              /*
               * Have the LWP check for signals.  This ensures that even if no LWP
               * is found to take the signal immediately, it should be taken soon.
               */
              lwp_lock(l);
              l->l_flag |= LW_PENDSIG;
      
              /*
               * SIGCONT can be masked, but if LWP is stopped, it needs restart.
               * Note: SIGKILL and SIGSTOP cannot be masked.
               */
              masked = sigismember(&l->l_sigmask, sig);
              if (masked && ((prop & SA_CONT) == 0 || l->l_stat != LSSTOP)) {
                      lwp_unlock(l);
                      return 0;
              }
      
              /*
               * If killing the process, make it run fast.
               */
              if (__predict_false((prop & SA_KILL) != 0) &&
                  action == SIG_DFL && l->l_priority < MAXPRI_USER) {
                      KASSERT(l->l_class == SCHED_OTHER);
                      lwp_changepri(l, MAXPRI_USER);
              }
      
              /*
               * If the LWP is running or on a run queue, then we win.  If it's
         * sleeping interruptibly, wake it and make it take the signal.  If
         * the sleep isn't interruptible, then the chances are it will get
               * to see the signal soon anyhow.  If suspended, it can't take the
               * signal right now.  If it's LWP private or for all LWPs, save it
               * for later; otherwise punt.
               */
              rv = 0;
      
              switch (l->l_stat) {
              case LSRUN:
              case LSONPROC:
                      lwp_need_userret(l);
                      rv = 1;
                      break;
      
              case LSSLEEP:
                      if ((l->l_flag & LW_SINTR) != 0) {
                              /* setrunnable() will release the lock. */
                              setrunnable(l);
                              return 1;
                      }
                      break;
      
              case LSSUSPENDED:
                      if ((prop & SA_KILL) != 0 && (l->l_flag & LW_WCORE) != 0) {
                              /* lwp_continue() will release the lock. */
                              lwp_continue(l);
                              return 1;
                      }
                      break;
      
              case LSSTOP:
                      if ((prop & SA_STOP) != 0)
                              break;
      
                      /*
                       * If the LWP is stopped and we are sending a continue
                       * signal, then start it again.
                       */
                      if ((prop & SA_CONT) != 0) {
                              if (l->l_wchan != NULL) {
                                      l->l_stat = LSSLEEP;
                                      p->p_nrlwps++;
                                      rv = 1;
                                      break;
                              }
                              /* setrunnable() will release the lock. */
                              setrunnable(l);
                              return 1;
                      } else if (l->l_wchan == NULL || (l->l_flag & LW_SINTR) != 0) {
                              /* setrunnable() will release the lock. */
                              setrunnable(l);
                              return 1;
                      }
                      break;
      
              default:
                      break;
              }
      
              lwp_unlock(l);
              return rv;
      }
      
      /*
       * Notify an LWP that it has a pending signal.
       */
      void
      signotify(struct lwp *l)
      {
              KASSERT(lwp_locked(l, NULL));
      
              l->l_flag |= LW_PENDSIG;
              lwp_need_userret(l);
      }
      
      /*
       * Find an LWP within process p that is waiting on signal ksi, and hand
       * it on.
       */
      static int
      sigunwait(struct proc *p, const ksiginfo_t *ksi)
      {
              struct lwp *l;
              int signo;
      
              KASSERT(mutex_owned(p->p_lock));
      
              signo = ksi->ksi_signo;
      
              if (ksi->ksi_lid != 0) {
                      /*
                       * Signal came via _lwp_kill().  Find the LWP and see if
                       * it's interested.
                       */
                      if ((l = lwp_find(p, ksi->ksi_lid)) == NULL)
                              return 0;
                      if (l->l_sigwaited == NULL ||
                          !sigismember(&l->l_sigwaitset, signo))
                              return 0;
              } else {
                      /*
                       * Look for any LWP that may be interested.
                       */
                      LIST_FOREACH(l, &p->p_sigwaiters, l_sigwaiter) {
                              KASSERT(l->l_sigwaited != NULL);
                              if (sigismember(&l->l_sigwaitset, signo))
                                      break;
                      }
              }
      
              if (l != NULL) {
                      l->l_sigwaited->ksi_info = ksi->ksi_info;
                      l->l_sigwaited = NULL;
                      LIST_REMOVE(l, l_sigwaiter);
                      cv_signal(&l->l_sigcv);
                      return 1;
              }
      
              return 0;
      }
      
      /*
       * Send the signal to the process.  If the signal has an action, the action
       * is usually performed by the target process rather than the caller; we add
       * the signal to the set of pending signals for the process.
       *
       * Exceptions:
       *   o When a stop signal is sent to a sleeping process that takes the
       *     default action, the process is stopped without awakening it.
       *   o SIGCONT restarts stopped processes (or puts them back to sleep)
 *     regardless of the signal action (e.g., blocked or ignored).
       *
       * Other ignored signals are discarded immediately.
       */
      int
      kpsignal2(struct proc *p, ksiginfo_t *ksi)
      {
              int prop, signo = ksi->ksi_signo;
              struct lwp *l = NULL;
              ksiginfo_t *kp;
              lwpid_t lid;
              sig_t action;
              bool toall;
              int error = 0;
      
              KASSERT(!cpu_intr_p());
              KASSERT(mutex_owned(proc_lock));
              KASSERT(mutex_owned(p->p_lock));
              KASSERT((ksi->ksi_flags & KSI_QUEUED) == 0);
              KASSERT(signo > 0 && signo < NSIG);
      
              /*
               * If the process is being created by fork, is a zombie or is
               * exiting, then just drop the signal here and bail out.
               */
              if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
                      return 0;
      
              /* XXX for core dump/debugger */
              p->p_sigctx.ps_lwp = ksi->ksi_lid;
              p->p_sigctx.ps_info = ksi->ksi_info;
      
              /*
               * Notify any interested parties of the signal.
               */
              KNOTE(&p->p_klist, NOTE_SIGNAL | signo);
      
              /*
               * Some signals including SIGKILL must act on the entire process.
               */
              kp = NULL;
              prop = sigprop[signo];
              toall = ((prop & SA_TOALL) != 0);
              lid = toall ? 0 : ksi->ksi_lid;
      
              /*
               * If proc is traced, always give parent a chance.
               */
              if (p->p_slflag & PSL_TRACED) {
                      action = SIG_DFL;
      
                      if (lid == 0) {
                              /*
                               * If the process is being traced and the signal
                               * is being caught, make sure to save any ksiginfo.
                               */
                              if ((kp = ksiginfo_alloc(p, ksi, PR_NOWAIT)) == NULL)
                                      goto discard;
                              if ((error = sigput(&p->p_sigpend, p, kp)) != 0)
                                      goto out;
                      }
              } else {
      
                      /*
                       * If the signal is being ignored, then drop it.  Note: we
                       * don't set SIGCONT in ps_sigignore, and if it is set to
                       * SIG_IGN, action will be SIG_DFL here.
                       */
                      if (sigismember(&p->p_sigctx.ps_sigignore, signo))
                              goto discard;
      
                      else if (sigismember(&p->p_sigctx.ps_sigcatch, signo))
                              action = SIG_CATCH;
                      else {
                              action = SIG_DFL;
      
                              /*
                               * If sending a tty stop signal to a member of an
                               * orphaned process group, discard the signal here if
                               * the action is default; don't stop the process below
                               * if sleeping, and don't clear any pending SIGCONT.
                               */
                              if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0)
                                      goto discard;
      
                              if (prop & SA_KILL && p->p_nice > NZERO)
                                      p->p_nice = NZERO;
                      }
              }
      
              /*
               * If stopping or continuing a process, discard any pending
               * signals that would do the inverse.
               */
              if ((prop & (SA_CONT | SA_STOP)) != 0) {
                      ksiginfoq_t kq;
      
                      ksiginfo_queue_init(&kq);
                      if ((prop & SA_CONT) != 0)
                              sigclear(&p->p_sigpend, &stopsigmask, &kq);
                      if ((prop & SA_STOP) != 0)
                              sigclear(&p->p_sigpend, &contsigmask, &kq);
                      ksiginfo_queue_drain(&kq);        /* XXXSMP */
              }
      
              /*
               * If the signal doesn't have SA_CANTMASK (no override for SIGKILL,
               * please!), check if any LWPs are waiting on it.  If yes, pass on
               * the signal info.  The signal won't be processed further here.
               */
              if ((prop & SA_CANTMASK) == 0 && !LIST_EMPTY(&p->p_sigwaiters) &&
                  p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0 &&
                  sigunwait(p, ksi))
                      goto discard;
      
              /*
               * XXXSMP Should be allocated by the caller, we're holding locks
               * here.
               */
              if (kp == NULL && (kp = ksiginfo_alloc(p, ksi, PR_NOWAIT)) == NULL)
                      goto discard;
      
              /*
               * LWP private signals are easy - just find the LWP and post
               * the signal to it.
               */
              if (lid != 0) {
                      l = lwp_find(p, lid);
                      if (l != NULL) {
                              if ((error = sigput(&l->l_sigpend, p, kp)) != 0)
                                      goto out;
                              membar_producer();
                              if (sigpost(l, action, prop, kp->ksi_signo) != 0)
                                      signo = -1;
                      }
                      goto out;
              }
      
              /*
               * Some signals go to all LWPs, even if posted with _lwp_kill()
               * or for an SA process.
               */
              if (p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0) {
                      if ((p->p_slflag & PSL_TRACED) != 0)
                              goto deliver;
      
                      /*
                       * If SIGCONT is default (or ignored) and process is
                       * asleep, we are finished; the process should not
                       * be awakened.
                       */
                      if ((prop & SA_CONT) != 0 && action == SIG_DFL)
                              goto out;
              } else {
                      /*
                       * Process is stopped or stopping.
                       * - If traced, then no action is needed, unless killing.
                       * - Run the process only if sending SIGCONT or SIGKILL.
                       */
                      if ((p->p_slflag & PSL_TRACED) != 0 && signo != SIGKILL) {
                              goto out;
                      }
                      if ((prop & SA_CONT) != 0 || signo == SIGKILL) {
                              /*
                               * Re-adjust p_nstopchild if the process was
                               * stopped but not yet collected by its parent.
                               */
                              if (p->p_stat == SSTOP && !p->p_waited)
                                      p->p_pptr->p_nstopchild--;
                              p->p_stat = SACTIVE;
                              p->p_sflag &= ~PS_STOPPING;
                              if (p->p_slflag & PSL_TRACED) {
                                      KASSERT(signo == SIGKILL);
                                      goto deliver;
                              }
                              /*
                               * Do not make signal pending if SIGCONT is default.
                               *
                               * If the process catches SIGCONT, let it handle the
                               * signal itself (if waiting on event - process runs,
                               * otherwise continues sleeping).
                               */
                              if ((prop & SA_CONT) != 0) {
                                      p->p_xsig = SIGCONT;
                                      p->p_sflag |= PS_CONTINUED;
                                      child_psignal(p, 0);
                                      if (action == SIG_DFL) {
                                              KASSERT(signo != SIGKILL);
                                              goto deliver;
                                      }
                              }
                      } else if ((prop & SA_STOP) != 0) {
                              /*
                               * Already stopped, don't need to stop again.
                               * (If we did the shell could get confused.)
                               */
                              goto out;
                      }
              }
              /*
               * Make signal pending.
               */
              KASSERT((p->p_slflag & PSL_TRACED) == 0);
              if ((error = sigput(&p->p_sigpend, p, kp)) != 0)
                      goto out;
      deliver:
              /*
               * Before we set LW_PENDSIG on any LWP, ensure that the signal is
               * visible on the per process list (for sigispending()).  This
               * is unlikely to be needed in practice, but...
               */
              membar_producer();
      
              /*
               * Try to find an LWP that can take the signal.
               */
              LIST_FOREACH(l, &p->p_lwps, l_sibling) {
                      if (sigpost(l, action, prop, kp->ksi_signo) && !toall)
                              break;
              }
              signo = -1;
      out:
              /*
               * If the ksiginfo wasn't used, then bin it.  XXXSMP freeing memory
               * with locks held.  The caller should take care of this.
               */
              ksiginfo_free(kp);
              if (signo == -1)
                      return error;
      discard:
              SDT_PROBE(proc, kernel, , signal__discard, l, p, signo, 0, 0);
              return error;
      }
      
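/*
 * Deliver a signal to an LWP via the emulation-specific sendsig routine.
 */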
      void
      kpsendsig(struct lwp *l, const ksiginfo_t *ksi, const sigset_t *mask)
      {
              struct proc *p = l->l_proc;
      
              KASSERT(mutex_owned(p->p_lock));
              (*p->p_emul->e_sendsig)(ksi, mask);
      }
      
      /*
 * Stop any LWPs sleeping interruptibly.
       */
      static void
      proc_stop_lwps(struct proc *p)
      {
              struct lwp *l;
      
              KASSERT(mutex_owned(p->p_lock));
              KASSERT((p->p_sflag & PS_STOPPING) != 0);
      
              LIST_FOREACH(l, &p->p_lwps, l_sibling) {
                      lwp_lock(l);
                      if (l->l_stat == LSSLEEP && (l->l_flag & LW_SINTR) != 0) {
                              l->l_stat = LSSTOP;
                              p->p_nrlwps--;
                      }
                      lwp_unlock(l);
              }
      }
      
      /*
 * Finish stopping the process.  Mark it stopped and notify the parent.
 *
 * Note that p_lock is dropped briefly by child_psignal().
       */
      static void
      proc_stop_done(struct proc *p, int ppmask)
      {
      
              KASSERT(mutex_owned(proc_lock));
              KASSERT(mutex_owned(p->p_lock));
              KASSERT((p->p_sflag & PS_STOPPING) != 0);
              KASSERT(p->p_nrlwps == 0 || (p->p_nrlwps == 1 && p == curproc));
      
              p->p_sflag &= ~PS_STOPPING;
              p->p_stat = SSTOP;
              p->p_waited = 0;
              p->p_pptr->p_nstopchild++;
      
              /* child_psignal drops p_lock briefly. */
              child_psignal(p, ppmask);
              cv_broadcast(&p->p_pptr->p_waitcv);
      }
      
      /*
 * Stop the current process and switch away, notifying the debugger of an
 * event specific to traced processes only.
       */
      void
      eventswitch(int code)
      {
              struct lwp *l = curlwp;
              struct proc *p = l->l_proc;
              struct sigacts *ps;
              sigset_t *mask;
              sig_t action;
              ksiginfo_t ksi;
              const int signo = SIGTRAP;
      
              KASSERT(mutex_owned(proc_lock));
              KASSERT(mutex_owned(p->p_lock));
              KASSERT(p->p_pptr != initproc);
              KASSERT(l->l_stat == LSONPROC);
              KASSERT(ISSET(p->p_slflag, PSL_TRACED));
              KASSERT(!ISSET(l->l_flag, LW_SYSTEM));
              KASSERT(p->p_nrlwps > 0);
              KASSERT((code == TRAP_CHLD) || (code == TRAP_LWP) ||
                      (code == TRAP_EXEC));
      
              /*
               * If we are exiting, demise now.
               *
               * This avoids notifying tracer and deadlocking.
         */
              if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
                      mutex_exit(p->p_lock);
                      mutex_exit(proc_lock);
                      lwp_exit(l);
                      panic("eventswitch");
                      /* NOTREACHED */
              }
      
              /*
         * If there's a pending SIGKILL, process it immediately.
               */
              if (p->p_xsig == SIGKILL ||
                  sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
                      mutex_exit(p->p_lock);
                      mutex_exit(proc_lock);
                      return;
              }
      
              KSI_INIT_TRAP(&ksi);
              ksi.ksi_lid = l->l_lid;
              ksi.ksi_info._signo = signo;
              ksi.ksi_info._code = code;
      
              /* Needed for ktrace */
              ps = p->p_sigacts;
              action = SIGACTION_PS(ps, signo).sa_handler;
              mask = &l->l_sigmask;
      
              p->p_xsig = signo;
              p->p_sigctx.ps_faked = true;
              p->p_sigctx.ps_lwp = ksi.ksi_lid;
              p->p_sigctx.ps_info = ksi.ksi_info;
      
              sigswitch(0, signo, false);
      
              if (code == TRAP_CHLD) {
                      mutex_enter(proc_lock);
                      while (l->l_vforkwaiting)
                              cv_wait(&l->l_waitcv, proc_lock);
                      mutex_exit(proc_lock);
              }
      
              if (ktrpoint(KTR_PSIG)) {
                      if (p->p_emul->e_ktrpsig)
                              p->p_emul->e_ktrpsig(signo, action, mask, &ksi);
                      else
                              ktrpsig(signo, action, mask, &ksi);
              }
      }
      
      /*
       * Stop the current process and switch away when being stopped or traced.
       */
      void
      sigswitch(int ppmask, int signo, bool relock)
      {
              struct lwp *l = curlwp;
              struct proc *p = l->l_proc;
              int biglocks;
      
              KASSERT(mutex_owned(p->p_lock));
              KASSERT(l->l_stat == LSONPROC);
              KASSERT(p->p_nrlwps > 0);
      
              /*
               * If we are exiting, demise now.
               *
               * This avoids notifying tracer and deadlocking.
               */
              if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
                      mutex_exit(p->p_lock);
                      if (!relock) {
                              mutex_exit(proc_lock);
                      }
                      lwp_exit(l);
                      panic("sigswitch");
                      /* NOTREACHED */
              }
      
              /*
               * On entry we know that the process needs to stop.  If it's
               * the result of a 'sideways' stop signal that has been sourced
               * through issignal(), then stop other LWPs in the process too.
               */
              if (p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0) {
                      KASSERT(signo != 0);
                      proc_stop(p, signo);
                      KASSERT(p->p_nrlwps > 0);
              }
      
              /*
               * If we are the last live LWP, and the stop was a result of
               * a new signal, then signal the parent.
               */
              if ((p->p_sflag & PS_STOPPING) != 0) {
                      if (relock && !mutex_tryenter(proc_lock)) {
                              mutex_exit(p->p_lock);
                              mutex_enter(proc_lock);
                              mutex_enter(p->p_lock);
                      }
      
                      if (p->p_nrlwps == 1 && (p->p_sflag & PS_STOPPING) != 0) {
                              /*
                               * Note that proc_stop_done() can drop
                               * p->p_lock briefly.
                               */
                              proc_stop_done(p, ppmask);
                      }
      
                      mutex_exit(proc_lock);
              }
      
              /*
               * Unlock and switch away.
               */
              KERNEL_UNLOCK_ALL(l, &biglocks);
              if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
                      p->p_nrlwps--;
                      lwp_lock(l);
                      KASSERT(l->l_stat == LSONPROC || l->l_stat == LSSLEEP);
                      l->l_stat = LSSTOP;
                      lwp_unlock(l);
              }
      
              mutex_exit(p->p_lock);
              lwp_lock(l);
              mi_switch(l);
              KERNEL_LOCK(biglocks, l);
      }
      
      /*
       * Check for a signal from the debugger.
       */
      static int
      sigchecktrace(void)
      {
              struct lwp *l = curlwp;
              struct proc *p = l->l_proc;
              int signo;
      
              KASSERT(mutex_owned(p->p_lock));
      
              /* If there's a pending SIGKILL, process it immediately. */
              if (sigismember(&p->p_sigpend.sp_set, SIGKILL))
                      return 0;
      
              /*
               * If we are no longer being traced, or the parent didn't
               * give us a signal, or we're stopping, look for more signals.
               */
              if ((p->p_slflag & PSL_TRACED) == 0 || p->p_xsig == 0 ||
                  (p->p_sflag & PS_STOPPING) != 0)
                      return 0;
      
              /*
               * If the new signal is being masked, look for other signals.
               * `p->p_sigctx.ps_siglist |= mask' is done in setrunnable().
               */
              signo = p->p_xsig;
              p->p_xsig = 0;
              if (sigismember(&l->l_sigmask, signo)) {
                      signo = 0;
              }
              return signo;
      }
      
      /*
       * If the current process has received a signal (should be caught or cause
       * termination, should interrupt current syscall), return the signal number.
       *
       * Stop signals with default action are processed immediately, then cleared;
       * they aren't returned.  This is checked after each entry to the system for
       * a syscall or trap.
       *
        * Note that if the process is exiting, the current LWP may be made to
        * exit as well (via sigswitch() and lwp_exit()), in which case this
        * function does not return at all.
       */
      int
      issignal(struct lwp *l)
      {
              struct proc *p;
              int signo, prop;
              sigpend_t *sp;
              sigset_t ss;
      
              p = l->l_proc;
              sp = NULL;
              signo = 0;
      
              KASSERT(p == curproc);
              KASSERT(mutex_owned(p->p_lock));
      
              for (;;) {
                      /* Discard any signals that we have decided not to take. */
                      if (signo != 0) {
                              (void)sigget(sp, NULL, signo, NULL);
                      }
      
                      /*
                       * If the process is stopped/stopping, then stop ourselves
                       * now that we're on the kernel/userspace boundary.  When
                       * we awaken, check for a signal from the debugger.
                       */
                      if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
                              sigswitch(PS_NOCLDSTOP, 0, true);
                              mutex_enter(p->p_lock);
                              signo = sigchecktrace();
                      } else if (p->p_stat == SACTIVE)
                              signo = sigchecktrace();
                      else
                              signo = 0;
      
                      /* Signals from the debugger are "out of band". */
                      sp = NULL;
      
                      /*
                       * If the debugger didn't provide a signal, find a pending
                       * signal from our set.  Check per-LWP signals first, and
                       * then per-process.
                       */
                      if (signo == 0) {
                              sp = &l->l_sigpend;
                              ss = sp->sp_set;
                              if ((p->p_lflag & PL_PPWAIT) != 0)
                                      sigminusset(&vforksigmask, &ss);
                              sigminusset(&l->l_sigmask, &ss);
      
                              if ((signo = firstsig(&ss)) == 0) {
                                      sp = &p->p_sigpend;
                                      ss = sp->sp_set;
                                      if ((p->p_lflag & PL_PPWAIT) != 0)
                                              sigminusset(&vforksigmask, &ss);
                                      sigminusset(&l->l_sigmask, &ss);
      
                                      if ((signo = firstsig(&ss)) == 0) {
                                              /*
                                               * No signal pending - clear the
                                               * indicator and bail out.
                                               */
                                              lwp_lock(l);
                                              l->l_flag &= ~LW_PENDSIG;
                                              lwp_unlock(l);
                                              sp = NULL;
                                              break;
                                      }
                              }
                      }
      
                      /*
                       * We should see pending but ignored signals only if
                       * we are being traced.
                       */
                      if (sigismember(&p->p_sigctx.ps_sigignore, signo) &&
                          (p->p_slflag & PSL_TRACED) == 0) {
                              /* Discard the signal. */
                              continue;
                      }
      
                      /*
                       * If traced, always stop, and stay stopped until released
                        * by the debugger.  If our parent is our debugger waiting
                       * for us and we vforked, don't hang as we could deadlock.
                       */
                      if (ISSET(p->p_slflag, PSL_TRACED) && signo != SIGKILL &&
                          !(ISSET(p->p_lflag, PL_PPWAIT) &&
                           (p->p_pptr == p->p_opptr))) {
                              /*
                               * Take the signal, but don't remove it from the
                               * siginfo queue, because the debugger can send
                               * it later.
                               */
                              if (sp)
                                      sigdelset(&sp->sp_set, signo);
                              p->p_xsig = signo;
      
                        /* Stop and notify the debugger of the signal. */
                              sigswitch(0, signo, true);
                              mutex_enter(p->p_lock);
      
                              /* Check for a signal from the debugger. */
                              if ((signo = sigchecktrace()) == 0)
                                      continue;
      
                              /* Signals from the debugger are "out of band". */
                              sp = NULL;
                      }
      
                      prop = sigprop[signo];
      
                      /*
                       * Decide whether the signal should be returned.
                       */
                      switch ((long)SIGACTION(p, signo).sa_handler) {
                      case (long)SIG_DFL:
                              /*
                               * Don't take default actions on system processes.
                               */
                              if (p->p_pid <= 1) {
      #ifdef DIAGNOSTIC
                                      /*
                                       * Are you sure you want to ignore SIGSEGV
                                       * in init? XXX
                                       */
                                      printf_nolog("Process (pid %d) got sig %d\n",
                                          p->p_pid, signo);
      #endif
                                      continue;
                              }
      
                        /*
                         * If there is a pending stop signal to process with
                         * default action, stop here, then clear the signal.
                         * However, if the process is a member of an orphaned
                         * process group, ignore tty stop signals.
                         */
                              if (prop & SA_STOP) {
                                      /*
                                       * XXX Don't hold proc_lock for p_lflag,
                                       * but it's not a big deal.
                                       */
                                      if ((ISSET(p->p_slflag, PSL_TRACED) &&
                                           !(ISSET(p->p_lflag, PL_PPWAIT) &&
                                           (p->p_pptr == p->p_opptr))) ||
                                          ((p->p_lflag & PL_ORPHANPG) != 0 &&
                                          prop & SA_TTYSTOP)) {
                                              /* Ignore the signal. */
                                              continue;
                                      }
                                      /* Take the signal. */
                                      (void)sigget(sp, NULL, signo, NULL);
                                      p->p_xsig = signo;
                                      p->p_sflag &= ~PS_CONTINUED;
                                      signo = 0;
                                      sigswitch(PS_NOCLDSTOP, p->p_xsig, true);
                                      mutex_enter(p->p_lock);
                              } else if (prop & SA_IGNORE) {
                                      /*
                                       * Except for SIGCONT, shouldn't get here.
                                       * Default action is to ignore; drop it.
                                       */
                                      continue;
                              }
                              break;
      
                      case (long)SIG_IGN:
      #ifdef DEBUG_ISSIGNAL
                               /*
                                * Masking above should prevent us from ever trying
                                * to take action on an ignored signal other
                                * than SIGCONT, unless the process is traced.
                                */
                              if ((prop & SA_CONT) == 0 &&
                                  (p->p_slflag & PSL_TRACED) == 0)
                                      printf_nolog("issignal\n");
      #endif
                              continue;
      
                      default:
                              /*
                               * This signal has an action, let postsig() process
                               * it.
                               */
                              break;
                      }
      
                      break;
              }
      
              l->l_sigpendset = sp;
              return signo;
      }
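
       /*
        * Illustrative sketch (not part of this file): a typical caller, such
        * as lwp_userret() or machine-dependent trap/syscall return code,
        * drains pending signals roughly as follows, holding p->p_lock across
        * the loop so issignal() and postsig() see a consistent signal state:
        *
        *        mutex_enter(p->p_lock);
        *        while ((signo = issignal(l)) != 0)
        *                postsig(signo);
        *        mutex_exit(p->p_lock);
        */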
      
      /*
       * Take the action for the specified signal
       * from the current set of pending signals.
       */
      void
      postsig(int signo)
      {
              struct lwp        *l;
              struct proc        *p;
              struct sigacts        *ps;
              sig_t                action;
              sigset_t        *returnmask;
              ksiginfo_t        ksi;
      
              l = curlwp;
              p = l->l_proc;
              ps = p->p_sigacts;
      
              KASSERT(mutex_owned(p->p_lock));
              KASSERT(signo > 0);
      
              /*
               * Set the new mask value and also defer further occurrences of this
               * signal.
               *
               * Special case: user has done a sigsuspend.  Here the current mask is
               * not of interest, but rather the mask from before the sigsuspend is
               * what we want restored after the signal processing is completed.
               */
              if (l->l_sigrestore) {
                      returnmask = &l->l_sigoldmask;
                      l->l_sigrestore = 0;
              } else
                      returnmask = &l->l_sigmask;
      
              /*
               * Commit to taking the signal before releasing the mutex.
               */
              action = SIGACTION_PS(ps, signo).sa_handler;
              l->l_ru.ru_nsignals++;
              if (l->l_sigpendset == NULL) {
                      /* From the debugger */
                      if (p->p_sigctx.ps_faked &&
                          signo == p->p_sigctx.ps_info._signo) {
                              KSI_INIT(&ksi);
                              ksi.ksi_info = p->p_sigctx.ps_info;
                              ksi.ksi_lid = p->p_sigctx.ps_lwp;
                              p->p_sigctx.ps_faked = false;
                      } else {
                              if (!siggetinfo(&l->l_sigpend, &ksi, signo))
                                      (void)siggetinfo(&p->p_sigpend, &ksi, signo);
                      }
              } else
                      sigget(l->l_sigpendset, &ksi, signo, NULL);
      
              if (ktrpoint(KTR_PSIG)) {
                      mutex_exit(p->p_lock);
                      if (p->p_emul->e_ktrpsig)
                              p->p_emul->e_ktrpsig(signo, action,
                                  returnmask, &ksi);
                      else
                              ktrpsig(signo, action, returnmask, &ksi);
                      mutex_enter(p->p_lock);
              }
      
              SDT_PROBE(proc, kernel, , signal__handle, signo, &ksi, action, 0, 0);
      
              if (action == SIG_DFL) {
                      /*
                       * Default action, where the default is to kill
                       * the process.  (Other cases were ignored above.)
                       */
                      sigexit(l, signo);
                      return;
              }
      
              /*
               * If we get here, the signal must be caught.
               */
      #ifdef DIAGNOSTIC
              if (action == SIG_IGN || sigismember(&l->l_sigmask, signo))
                      panic("postsig action");
      #endif
      
              kpsendsig(l, &ksi, returnmask);
      }
      
      /*
       * sendsig:
       *
       *        Default signal delivery method for NetBSD.
       */
      void
      sendsig(const struct ksiginfo *ksi, const sigset_t *mask)
      {
              struct sigacts *sa;
              int sig;
      
              sig = ksi->ksi_signo;
              sa = curproc->p_sigacts;
      
              switch (sa->sa_sigdesc[sig].sd_vers)  {
              case 0:
              case 1:
                      /* Compat for 1.6 and earlier. */
                      if (sendsig_sigcontext_vec == NULL) {
                              break;
                      }
                      (*sendsig_sigcontext_vec)(ksi, mask);
                      return;
              case 2:
              case 3:
                      sendsig_siginfo(ksi, mask);
                      return;
              default:
                      break;
              }
      
              printf("sendsig: bad version %d\n", sa->sa_sigdesc[sig].sd_vers);
              sigexit(curlwp, SIGILL);
      }
      
      /*
       * sendsig_reset:
       *
       *        Reset the signal action.  Called from emulation specific sendsig()
       *        before unlocking to deliver the signal.
       */
      void
      sendsig_reset(struct lwp *l, int signo)
      {
              struct proc *p = l->l_proc;
              struct sigacts *ps = p->p_sigacts;
      
              KASSERT(mutex_owned(p->p_lock));
      
              p->p_sigctx.ps_lwp = 0;
              memset(&p->p_sigctx.ps_info, 0, sizeof(p->p_sigctx.ps_info));
      
              mutex_enter(&ps->sa_mutex);
              sigplusset(&SIGACTION_PS(ps, signo).sa_mask, &l->l_sigmask);
              if (SIGACTION_PS(ps, signo).sa_flags & SA_RESETHAND) {
                      sigdelset(&p->p_sigctx.ps_sigcatch, signo);
                      if (signo != SIGCONT && sigprop[signo] & SA_IGNORE)
                              sigaddset(&p->p_sigctx.ps_sigignore, signo);
                      SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
              }
              mutex_exit(&ps->sa_mutex);
      }
      
      /*
       * Kill the current process for stated reason.
       */
      void
      killproc(struct proc *p, const char *why)
      {
      
              KASSERT(mutex_owned(proc_lock));
      
              log(LOG_ERR, "pid %d was killed: %s\n", p->p_pid, why);
              uprintf_locked("sorry, pid %d was killed: %s\n", p->p_pid, why);
              psignal(p, SIGKILL);
      }
      
      /*
       * Force the current process to exit with the specified signal, dumping core
       * if appropriate.  We bypass the normal tests for masked and caught
       * signals, allowing unrecoverable failures to terminate the process without
       * changing signal state.  Mark the accounting record with the signal
       * termination.  If dumping core, save the signal number for the debugger. 
       * Calls exit and does not return.
       */
      void
      sigexit(struct lwp *l, int signo)
      {
              int exitsig, error, docore;
              struct proc *p;
              struct lwp *t;
      
              p = l->l_proc;
      
              KASSERT(mutex_owned(p->p_lock));
              KERNEL_UNLOCK_ALL(l, NULL);
      
              /*
               * Don't permit coredump() multiple times in the same process.
               * Call back into sigexit, where we will be suspended until
               * the deed is done.  Note that this is a recursive call, but
               * LW_WCORE will prevent us from coming back this way.
               */
              if ((p->p_sflag & PS_WCORE) != 0) {
                      lwp_lock(l);
                      l->l_flag |= (LW_WCORE | LW_WEXIT | LW_WSUSPEND);
                      lwp_unlock(l);
                      mutex_exit(p->p_lock);
                      lwp_userret(l);
                      panic("sigexit 1");
                      /* NOTREACHED */
              }
      
              /* If process is already on the way out, then bail now. */
              if ((p->p_sflag & PS_WEXIT) != 0) {
                      mutex_exit(p->p_lock);
                      lwp_exit(l);
                      panic("sigexit 2");
                      /* NOTREACHED */
              }
      
              /*
               * Prepare all other LWPs for exit.  If dumping core, suspend them
               * so that their registers are available long enough to be dumped.
                */
              if ((docore = (sigprop[signo] & SA_CORE)) != 0) {
                      p->p_sflag |= PS_WCORE;
                      for (;;) {
                              LIST_FOREACH(t, &p->p_lwps, l_sibling) {
                                      lwp_lock(t);
                                      if (t == l) {
                                              t->l_flag &= ~LW_WSUSPEND;
                                              lwp_unlock(t);
                                              continue;
                                      }
                                      t->l_flag |= (LW_WCORE | LW_WEXIT);
                                      lwp_suspend(l, t);
                              }
      
                              if (p->p_nrlwps == 1)
                                      break;
      
                              /*
                               * Kick any LWPs sitting in lwp_wait1(), and wait
                               * for everyone else to stop before proceeding.
                               */
                              p->p_nlwpwait++;
                              cv_broadcast(&p->p_lwpcv);
                              cv_wait(&p->p_lwpcv, p->p_lock);
                              p->p_nlwpwait--;
                      }
              }
      
              exitsig = signo;
              p->p_acflag |= AXSIG;
              memset(&p->p_sigctx.ps_info, 0, sizeof(p->p_sigctx.ps_info));
              p->p_sigctx.ps_info._signo = signo;
              p->p_sigctx.ps_info._code = SI_NOINFO;
      
              if (docore) {
                      mutex_exit(p->p_lock);
                      error = (*coredump_vec)(l, NULL);
      
                      if (kern_logsigexit) {
                              int uid = l->l_cred ?
                                  (int)kauth_cred_geteuid(l->l_cred) : -1;
      
                              if (error)
                                      log(LOG_INFO, lognocoredump, p->p_pid,
                                          p->p_comm, uid, signo, error);
                              else
                                      log(LOG_INFO, logcoredump, p->p_pid,
                                          p->p_comm, uid, signo);
                      }
      
      #ifdef PAX_SEGVGUARD
                      pax_segvguard(l, p->p_textvp, p->p_comm, true);
      #endif /* PAX_SEGVGUARD */
                      /* Acquire the sched state mutex.  exit1() will release it. */
                      mutex_enter(p->p_lock);
                      if (error == 0)
                              p->p_sflag |= PS_COREDUMP;
              }
      
              /* No longer dumping core. */
              p->p_sflag &= ~PS_WCORE;
      
              exit1(l, 0, exitsig);
              /* NOTREACHED */
      }
      
      /*
       * Put process 'p' into the stopped state and optionally, notify the parent.
       */
      void
      proc_stop(struct proc *p, int signo)
      {
              struct lwp *l;
      
              KASSERT(mutex_owned(p->p_lock));
      
              /*
               * First off, set the stopping indicator and bring all sleeping
                * LWPs to a halt so they are included in p->p_nrlwps.  We mustn't
               * unlock between here and the p->p_nrlwps check below.
               */
              p->p_sflag |= PS_STOPPING;
              membar_producer();
      
              proc_stop_lwps(p);
      
              /*
               * If there are no LWPs available to take the signal, then we
               * signal the parent process immediately.  Otherwise, the last
               * LWP to stop will take care of it.
               */
      
              if (p->p_nrlwps == 0) {
                      proc_stop_done(p, PS_NOCLDSTOP);
              } else {
                      /*
                       * Have the remaining LWPs come to a halt, and trigger
                       * proc_stop_callout() to ensure that they do.
                       */
                      LIST_FOREACH(l, &p->p_lwps, l_sibling) {
                              sigpost(l, SIG_DFL, SA_STOP, signo);
                      }
                      callout_schedule(&proc_stop_ch, 1);
              }
      }
      
      /*
        * When stopping a process, we do not immediately set sleeping LWPs stopped,
        * but wait for them to come to a halt at the kernel-user boundary.  This is
        * to allow LWPs to release any locks that they may hold before stopping.
        *
        * Non-interruptible sleeps can be long, and there is the potential for an
        * LWP to begin sleeping interruptibly soon after the process has been set
        * stopping (PS_STOPPING).  These LWPs will not notice that the process is
        * stopping, and so complete halt of the process and the return of status
        * information to the parent could be delayed indefinitely.
        *
        * To handle this race, proc_stop_callout() runs once per tick while there
        * are stopping processes in the system.  It sets LWPs that are sleeping
        * interruptibly into the LSSTOP state.
       *
       * Note that we are not concerned about keeping all LWPs stopped while the
       * process is stopped: stopped LWPs can awaken briefly to handle signals. 
       * What we do need to ensure is that all LWPs in a stopping process have
       * stopped at least once, so that notification can be sent to the parent
       * process.
       */
      static void
      proc_stop_callout(void *cookie)
      {
              bool more, restart;
              struct proc *p;
      
              (void)cookie;
      
              do {
                      restart = false;
                      more = false;
      
                      mutex_enter(proc_lock);
                      PROCLIST_FOREACH(p, &allproc) {
                              mutex_enter(p->p_lock);
      
                              if ((p->p_sflag & PS_STOPPING) == 0) {
                                      mutex_exit(p->p_lock);
                                      continue;
                              }
      
                        /* Stop any LWPs sleeping interruptibly. */
                              proc_stop_lwps(p);
                              if (p->p_nrlwps == 0) {
                                      /*
                                       * We brought the process to a halt.
                                       * Mark it as stopped and notify the
                                       * parent.
                                       *
                                       * Note that proc_stop_done() will
                                       * drop p->p_lock briefly.
                                       * Arrange to restart and check
                                       * all processes again.
                                       */
                                      restart = true;
                                      proc_stop_done(p, PS_NOCLDSTOP);
                              } else
                                      more = true;
      
                              mutex_exit(p->p_lock);
                              if (restart)
                                      break;
                      }
                      mutex_exit(proc_lock);
              } while (restart);
      
              /*
               * If we noted processes that are stopping but still have
               * running LWPs, then arrange to check again in 1 tick.
               */
              if (more)
                      callout_schedule(&proc_stop_ch, 1);
      }
      
      /*
       * Given a process in state SSTOP, set the state back to SACTIVE and
       * move LSSTOP'd LWPs to LSSLEEP or make them runnable.
       */
      void
      proc_unstop(struct proc *p)
      {
              struct lwp *l;
              int sig;
      
              KASSERT(mutex_owned(proc_lock));
              KASSERT(mutex_owned(p->p_lock));
      
              p->p_stat = SACTIVE;
              p->p_sflag &= ~PS_STOPPING;
              sig = p->p_xsig;
      
              if (!p->p_waited)
                      p->p_pptr->p_nstopchild--;
      
              LIST_FOREACH(l, &p->p_lwps, l_sibling) {
                      lwp_lock(l);
                      if (l->l_stat != LSSTOP) {
                              lwp_unlock(l);
                              continue;
                      }
                      if (l->l_wchan == NULL) {
                              setrunnable(l);
                              continue;
                      }
                      if (sig && (l->l_flag & LW_SINTR) != 0) {
                              setrunnable(l);
                              sig = 0;
                      } else {
                              l->l_stat = LSSLEEP;
                              p->p_nrlwps++;
                              lwp_unlock(l);
                      }
              }
      }
      
      void
      proc_stoptrace(int trapno, int sysnum, const register_t args[],
                     const register_t *ret, int error)
      {
              struct lwp *l = curlwp;
              struct proc *p = l->l_proc;
              struct sigacts *ps;
              sigset_t *mask;
              sig_t action;
              ksiginfo_t ksi;
              size_t i, sy_narg;
              const int signo = SIGTRAP;
      
              KASSERT((trapno == TRAP_SCE) || (trapno == TRAP_SCX));
              KASSERT(p->p_pptr != initproc);
              KASSERT(ISSET(p->p_slflag, PSL_TRACED));
              KASSERT(ISSET(p->p_slflag, PSL_SYSCALL));
      
              sy_narg = p->p_emul->e_sysent[sysnum].sy_narg;
      
              KSI_INIT_TRAP(&ksi);
              ksi.ksi_lid = l->l_lid;
              ksi.ksi_signo = signo;
              ksi.ksi_code = trapno;
      
              ksi.ksi_sysnum = sysnum;
              if (trapno == TRAP_SCE) {
                      ksi.ksi_retval[0] = 0;
                      ksi.ksi_retval[1] = 0;
                      ksi.ksi_error = 0;
              } else {
                      ksi.ksi_retval[0] = ret[0];
                      ksi.ksi_retval[1] = ret[1];
                      ksi.ksi_error = error;
              }
      
              memset(ksi.ksi_args, 0, sizeof(ksi.ksi_args));
      
              for (i = 0; i < sy_narg; i++)
                      ksi.ksi_args[i] = args[i];
      
              mutex_enter(p->p_lock);
      
              /*
               * If we are exiting, demise now.
               *
                * This avoids notifying the tracer and deadlocking.
               */
              if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
                      mutex_exit(p->p_lock);
                      lwp_exit(l);
                      panic("proc_stoptrace");
                      /* NOTREACHED */
              }
      
              /*
               * If there's a pending SIGKILL process it immediately.
               */
              if (p->p_xsig == SIGKILL ||
                  sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
                      mutex_exit(p->p_lock);
                      return;
              }
      
              /* Needed for ktrace */
              ps = p->p_sigacts;
              action = SIGACTION_PS(ps, signo).sa_handler;
              mask = &l->l_sigmask;
      
              p->p_xsig = signo;
              p->p_sigctx.ps_lwp = ksi.ksi_lid;
              p->p_sigctx.ps_info = ksi.ksi_info;
              sigswitch(0, signo, true);
      
              if (ktrpoint(KTR_PSIG)) {
                      if (p->p_emul->e_ktrpsig)
                              p->p_emul->e_ktrpsig(signo, action, mask, &ksi);
                      else
                              ktrpsig(signo, action, mask, &ksi);
              }
      }
      
      static int
      filt_sigattach(struct knote *kn)
      {
              struct proc *p = curproc;
      
              kn->kn_obj = p;
              kn->kn_flags |= EV_CLEAR;        /* automatically set */
      
              mutex_enter(p->p_lock);
              SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
              mutex_exit(p->p_lock);
      
              return 0;
      }
      
      static void
      filt_sigdetach(struct knote *kn)
      {
              struct proc *p = kn->kn_obj;
      
              mutex_enter(p->p_lock);
              SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
              mutex_exit(p->p_lock);
      }
      
      /*
       * Signal knotes are shared with proc knotes, so we apply a mask to
       * the hint in order to differentiate them from process hints.  This
       * could be avoided by using a signal-specific knote list, but probably
       * isn't worth the trouble.
       */
      static int
      filt_signal(struct knote *kn, long hint)
      {
      
              if (hint & NOTE_SIGNAL) {
                      hint &= ~NOTE_SIGNAL;
      
                      if (kn->kn_id == hint)
                              kn->kn_data++;
              }
              return (kn->kn_data != 0);
      }
      
      const struct filterops sig_filtops = {
                      .f_isfd = 0,
                      .f_attach = filt_sigattach,
                      .f_detach = filt_sigdetach,
                      .f_event = filt_signal,
      };
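
       /*
        * Illustrative sketch (userland code, not compiled here): sig_filtops
        * backs EVFILT_SIGNAL, where the knote identifier is the signal number
        * and, because of EV_CLEAR, kn_data counts the deliveries noted since
        * the previous kevent() retrieval.  A consumer might look roughly like:
        *
        *        struct kevent kev;
        *        int kq = kqueue();
        *
        *        signal(SIGUSR1, SIG_IGN);        // avoid default termination
        *        EV_SET(&kev, SIGUSR1, EVFILT_SIGNAL, EV_ADD | EV_ENABLE,
        *            0, 0, 0);
        *        kevent(kq, &kev, 1, NULL, 0, NULL);
        *        kevent(kq, NULL, 0, &kev, 1, NULL);
        *        // kev.data now holds the number of SIGUSR1s noted
        */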
      /*        $NetBSD: subr_vmem.c,v 1.97 2018/02/08 09:05:20 dholland Exp $        */
      
      /*-
       * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi,
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      /*
       * reference:
       * -        Magazines and Vmem: Extending the Slab Allocator
       *        to Many CPUs and Arbitrary Resources
       *        http://www.usenix.org/event/usenix01/bonwick.html
       *
       * locking & the boundary tag pool:
       * -         A pool(9) is used for vmem boundary tags
       * -         During a pool get call the global vmem_btag_refill_lock is taken,
       *        to serialize access to the allocation reserve, but no other
       *        vmem arena locks.
       * -        During pool_put calls no vmem mutexes are locked.
        * -        pool_drain doesn't hold the pool's mutex while releasing memory to
        *        its backing, therefore there is no interference with any vmem mutexes.
        * -        The boundary tag pool is forced to put page headers into pool pages
        *        (PR_PHINPAGE) and not off page, to avoid pool recursion.
        *        (Given sizeof(bt_t), that should be the case anyway.)
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: subr_vmem.c,v 1.97 2018/02/08 09:05:20 dholland Exp $");
      
      #if defined(_KERNEL) && defined(_KERNEL_OPT)
      #include "opt_ddb.h"
      #endif /* defined(_KERNEL) && defined(_KERNEL_OPT) */
      
      #include <sys/param.h>
      #include <sys/hash.h>
      #include <sys/queue.h>
      #include <sys/bitops.h>
      
      #if defined(_KERNEL)
      #include <sys/systm.h>
      #include <sys/kernel.h>        /* hz */
      #include <sys/callout.h>
      #include <sys/kmem.h>
      #include <sys/pool.h>
      #include <sys/vmem.h>
      #include <sys/vmem_impl.h>
      #include <sys/workqueue.h>
      #include <sys/atomic.h>
      #include <uvm/uvm.h>
      #include <uvm/uvm_extern.h>
      #include <uvm/uvm_km.h>
      #include <uvm/uvm_page.h>
      #include <uvm/uvm_pdaemon.h>
      #else /* defined(_KERNEL) */
      #include <stdio.h>
      #include <errno.h>
      #include <assert.h>
      #include <stdlib.h>
      #include <string.h>
      #include "../sys/vmem.h"
      #include "../sys/vmem_impl.h"
      #endif /* defined(_KERNEL) */
      
      
      #if defined(_KERNEL)
      #include <sys/evcnt.h>
      #define VMEM_EVCNT_DEFINE(name) \
      struct evcnt vmem_evcnt_##name = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, \
          "vmem", #name); \
      EVCNT_ATTACH_STATIC(vmem_evcnt_##name);
      #define VMEM_EVCNT_INCR(ev)        vmem_evcnt_##ev.ev_count++
      #define VMEM_EVCNT_DECR(ev)        vmem_evcnt_##ev.ev_count--
      
      VMEM_EVCNT_DEFINE(static_bt_count)
      VMEM_EVCNT_DEFINE(static_bt_inuse)
      
      #define        VMEM_CONDVAR_INIT(vm, wchan)        cv_init(&vm->vm_cv, wchan)
      #define        VMEM_CONDVAR_DESTROY(vm)        cv_destroy(&vm->vm_cv)
      #define        VMEM_CONDVAR_WAIT(vm)                cv_wait(&vm->vm_cv, &vm->vm_lock)
      #define        VMEM_CONDVAR_BROADCAST(vm)        cv_broadcast(&vm->vm_cv)
      
      #else /* defined(_KERNEL) */
      
      #define VMEM_EVCNT_INCR(ev)        /* nothing */
      #define VMEM_EVCNT_DECR(ev)        /* nothing */
      
      #define        VMEM_CONDVAR_INIT(vm, wchan)        /* nothing */
      #define        VMEM_CONDVAR_DESTROY(vm)        /* nothing */
      #define        VMEM_CONDVAR_WAIT(vm)                /* nothing */
      #define        VMEM_CONDVAR_BROADCAST(vm)        /* nothing */
      
      #define        UNITTEST
      #define        KASSERT(a)                assert(a)
      #define        mutex_init(a, b, c)        /* nothing */
      #define        mutex_destroy(a)        /* nothing */
      #define        mutex_enter(a)                /* nothing */
      #define        mutex_tryenter(a)        true
      #define        mutex_exit(a)                /* nothing */
      #define        mutex_owned(a)                /* nothing */
      #define        ASSERT_SLEEPABLE()        /* nothing */
       #define        panic(...)                do { printf(__VA_ARGS__); abort(); } while (0)
      #endif /* defined(_KERNEL) */
      
      #if defined(VMEM_SANITY)
      static void vmem_check(vmem_t *);
      #else /* defined(VMEM_SANITY) */
      #define vmem_check(vm)        /* nothing */
      #endif /* defined(VMEM_SANITY) */
      
      #define        VMEM_HASHSIZE_MIN        1        /* XXX */
      #define        VMEM_HASHSIZE_MAX        65536        /* XXX */
      #define        VMEM_HASHSIZE_INIT        1
      
      #define        VM_FITMASK        (VM_BESTFIT | VM_INSTANTFIT)
      
      #if defined(_KERNEL)
      static bool vmem_bootstrapped = false;
      static kmutex_t vmem_list_lock;
      static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list);
      #endif /* defined(_KERNEL) */
      
      /* ---- misc */
      
      #define        VMEM_LOCK(vm)                mutex_enter(&vm->vm_lock)
      #define        VMEM_TRYLOCK(vm)        mutex_tryenter(&vm->vm_lock)
      #define        VMEM_UNLOCK(vm)                mutex_exit(&vm->vm_lock)
      #define        VMEM_LOCK_INIT(vm, ipl)        mutex_init(&vm->vm_lock, MUTEX_DEFAULT, ipl)
      #define        VMEM_LOCK_DESTROY(vm)        mutex_destroy(&vm->vm_lock)
      #define        VMEM_ASSERT_LOCKED(vm)        KASSERT(mutex_owned(&vm->vm_lock))
      
      #define        VMEM_ALIGNUP(addr, align) \
              (-(-(addr) & -(align)))
      
      #define        VMEM_CROSS_P(addr1, addr2, boundary) \
              ((((addr1) ^ (addr2)) & -(boundary)) != 0)
      
      #define        ORDER2SIZE(order)        ((vmem_size_t)1 << (order))
      #define        SIZE2ORDER(size)        ((int)ilog2(size))
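
       /*
        * Worked examples (illustrative only; "align" and "boundary" are
        * assumed to be powers of two, as elsewhere in this file):
        *
        *        VMEM_ALIGNUP(0x123, 0x100) == 0x200        (rounded up)
        *        VMEM_ALIGNUP(0x200, 0x100) == 0x200        (already aligned)
        *        VMEM_CROSS_P(0x1ff, 0x200, 0x100) != 0     (crosses a 0x100 boundary)
        *        ORDER2SIZE(3) == 8, SIZE2ORDER(8) == 3
        */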
      
      #if !defined(_KERNEL)
      #define        xmalloc(sz, flags)        malloc(sz)
      #define        xfree(p, sz)                free(p)
      #define        bt_alloc(vm, flags)        malloc(sizeof(bt_t))
      #define        bt_free(vm, bt)                free(bt)
      #else /* defined(_KERNEL) */
      
       #define        xmalloc(sz, flags) \
           kmem_alloc(sz, ((flags) & VM_SLEEP) ? KM_SLEEP : KM_NOSLEEP)
       #define        xfree(p, sz)                kmem_free(p, sz)
      
      /*
       * BT_RESERVE calculation:
        * we allocate memory for boundary tags with vmem; therefore we have
        * to keep a reserve of bts used to allocate memory for bts.
        * This reserve is 4 for each arena involved in allocating vmem's memory.
        * BT_MAXFREE: don't cache excessive counts of bts in arenas.
       */
      #define STATIC_BT_COUNT 200
      #define BT_MINRESERVE 4
      #define BT_MAXFREE 64
      
      static struct vmem_btag static_bts[STATIC_BT_COUNT];
      static int static_bt_count = STATIC_BT_COUNT;
      
      static struct vmem kmem_va_meta_arena_store;
      vmem_t *kmem_va_meta_arena;
      static struct vmem kmem_meta_arena_store;
      vmem_t *kmem_meta_arena = NULL;
      
      static kmutex_t vmem_btag_refill_lock;
      static kmutex_t vmem_btag_lock;
      static LIST_HEAD(, vmem_btag) vmem_btag_freelist;
      static size_t vmem_btag_freelist_count = 0;
      static struct pool vmem_btag_pool;
      
      static void
      vmem_kick_pdaemon(void)
      {
      #if defined(_KERNEL)
              mutex_spin_enter(&uvm_fpageqlock);
              uvm_kick_pdaemon();
              mutex_spin_exit(&uvm_fpageqlock);
      #endif
      }
      
      /* ---- boundary tag */
      
      static int bt_refill(vmem_t *vm);
      
      static void *
      pool_page_alloc_vmem_meta(struct pool *pp, int flags)
      {
              const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
              vmem_addr_t va;
              int ret;
      
              ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz,
                  (vflags & ~VM_FITMASK) | VM_INSTANTFIT | VM_POPULATING, &va);
      
              return ret ? NULL : (void *)va;
      }
      
      static void
      pool_page_free_vmem_meta(struct pool *pp, void *v)
      {
      
              vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz);
      }
      
      /* allocator for vmem-pool metadata */
      struct pool_allocator pool_allocator_vmem_meta = {
              .pa_alloc = pool_page_alloc_vmem_meta,
              .pa_free = pool_page_free_vmem_meta,
              .pa_pagesz = 0
      };
      
      static int
      bt_refill(vmem_t *vm)
      {
              bt_t *bt;
      
              VMEM_LOCK(vm);
              if (vm->vm_nfreetags > BT_MINRESERVE) {
                      VMEM_UNLOCK(vm);
                      return 0;
              }
      
              mutex_enter(&vmem_btag_lock);
              while (!LIST_EMPTY(&vmem_btag_freelist) &&
                  vm->vm_nfreetags <= BT_MINRESERVE) {
                      bt = LIST_FIRST(&vmem_btag_freelist);
                      LIST_REMOVE(bt, bt_freelist);
                      LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
                      vm->vm_nfreetags++;
                      vmem_btag_freelist_count--;
                      VMEM_EVCNT_INCR(static_bt_inuse);
              }
              mutex_exit(&vmem_btag_lock);
      
              while (vm->vm_nfreetags <= BT_MINRESERVE) {
                      VMEM_UNLOCK(vm);
                      mutex_enter(&vmem_btag_refill_lock);
                      bt = pool_get(&vmem_btag_pool, PR_NOWAIT);
                      mutex_exit(&vmem_btag_refill_lock);
                      VMEM_LOCK(vm);
                      if (bt == NULL)
                              break;
                      LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
                      vm->vm_nfreetags++;
              }
      
              if (vm->vm_nfreetags <= BT_MINRESERVE) {
                      VMEM_UNLOCK(vm);
                      return ENOMEM;
              }
      
              VMEM_UNLOCK(vm);
      
              if (kmem_meta_arena != NULL) {
                      (void)bt_refill(kmem_arena);
                      (void)bt_refill(kmem_va_meta_arena);
                      (void)bt_refill(kmem_meta_arena);
              }
      
              return 0;
      }
      
      static bt_t *
      bt_alloc(vmem_t *vm, vm_flag_t flags)
      {
              bt_t *bt;
              VMEM_LOCK(vm);
              while (vm->vm_nfreetags <= BT_MINRESERVE && (flags & VM_POPULATING) == 0) {
                      VMEM_UNLOCK(vm);
                      if (bt_refill(vm)) {
                              if ((flags & VM_NOSLEEP) != 0) {
                                      return NULL;
                              }
      
                              /*
                               * It would be nice to wait for something specific here
                               * but there are multiple ways that a retry could
                               * succeed and we can't wait for multiple things
                               * simultaneously.  So we'll just sleep for an arbitrary
                               * short period of time and retry regardless.
                               * This should be a very rare case.
                               */
      
                              vmem_kick_pdaemon();
                              kpause("btalloc", false, 1, NULL);
                      }
                      VMEM_LOCK(vm);
              }
              bt = LIST_FIRST(&vm->vm_freetags);
              LIST_REMOVE(bt, bt_freelist);
              vm->vm_nfreetags--;
              VMEM_UNLOCK(vm);
      
              return bt;
      }
      
      static void
      bt_free(vmem_t *vm, bt_t *bt)
      {
      
              VMEM_LOCK(vm);
              LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
              vm->vm_nfreetags++;
              VMEM_UNLOCK(vm);
      }
      
      static void
      bt_freetrim(vmem_t *vm, int freelimit)
      {
              bt_t *t;
              LIST_HEAD(, vmem_btag) tofree;
      
              LIST_INIT(&tofree);
      
              VMEM_LOCK(vm);
              while (vm->vm_nfreetags > freelimit) {
                      bt_t *bt = LIST_FIRST(&vm->vm_freetags);
                      LIST_REMOVE(bt, bt_freelist);
                      vm->vm_nfreetags--;
                      if (bt >= static_bts
                          && bt < &static_bts[STATIC_BT_COUNT]) {
                              mutex_enter(&vmem_btag_lock);
                              LIST_INSERT_HEAD(&vmem_btag_freelist, bt, bt_freelist);
                              vmem_btag_freelist_count++;
                              mutex_exit(&vmem_btag_lock);
                              VMEM_EVCNT_DECR(static_bt_inuse);
                      } else {
                              LIST_INSERT_HEAD(&tofree, bt, bt_freelist);
                      }
              }
      
              VMEM_UNLOCK(vm);
              while (!LIST_EMPTY(&tofree)) {
                      t = LIST_FIRST(&tofree);
                      LIST_REMOVE(t, bt_freelist);
                      pool_put(&vmem_btag_pool, t);
              }
      }
      #endif        /* defined(_KERNEL) */
      
      /*
       * freelist[0] ... [1, 1]
       * freelist[1] ... [2, 3]
       * freelist[2] ... [4, 7]
       * freelist[3] ... [8, 15]
       *  :
       * freelist[n] ... [(1 << n), (1 << (n + 1)) - 1]
       *  :
       */
      
      static struct vmem_freelist *
      bt_freehead_tofree(vmem_t *vm, vmem_size_t size)
      {
              const vmem_size_t qsize = size >> vm->vm_quantum_shift;
              const int idx = SIZE2ORDER(qsize);
      
              KASSERT(size != 0 && qsize != 0);
              KASSERT((size & vm->vm_quantum_mask) == 0);
              KASSERT(idx >= 0);
              KASSERT(idx < VMEM_MAXORDER);
      
              return &vm->vm_freelist[idx];
      }
      
      /*
       * bt_freehead_toalloc: return the freelist for the given size and allocation
       * strategy.
       *
       * for VM_INSTANTFIT, return the list in which any blocks are large enough
       * for the requested size.  otherwise, return the list which can have blocks
       * large enough for the requested size.
       */
      
      static struct vmem_freelist *
      bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, vm_flag_t strat)
      {
              const vmem_size_t qsize = size >> vm->vm_quantum_shift;
              int idx = SIZE2ORDER(qsize);
      
              KASSERT(size != 0 && qsize != 0);
              KASSERT((size & vm->vm_quantum_mask) == 0);
      
              if (strat == VM_INSTANTFIT && ORDER2SIZE(idx) != qsize) {
                      idx++;
                      /* check too large request? */
              }
              KASSERT(idx >= 0);
              KASSERT(idx < VMEM_MAXORDER);
      
              return &vm->vm_freelist[idx];
      }
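
       /*
        * Example (illustrative): for a request of 6 quanta, SIZE2ORDER(6) == 2.
        * VM_BESTFIT starts at freelist[2] (blocks of [4, 7] quanta), which may
        * contain blocks smaller than 6 and so must be searched; VM_INSTANTFIT
        * bumps the index to freelist[3] ([8, 15]), where any block is large
        * enough to satisfy the request.
        */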
      
      /* ---- boundary tag hash */
      
      static struct vmem_hashlist *
      bt_hashhead(vmem_t *vm, vmem_addr_t addr)
      {
              struct vmem_hashlist *list;
              unsigned int hash;
      
              hash = hash32_buf(&addr, sizeof(addr), HASH32_BUF_INIT);
              list = &vm->vm_hashlist[hash % vm->vm_hashsize];
      
              return list;
      }
      
      static bt_t *
      bt_lookupbusy(vmem_t *vm, vmem_addr_t addr)
      {
              struct vmem_hashlist *list;
              bt_t *bt;
      
              list = bt_hashhead(vm, addr);
              LIST_FOREACH(bt, list, bt_hashlist) {
                      if (bt->bt_start == addr) {
                              break;
                      }
              }
      
              return bt;
      }
      
      static void
      bt_rembusy(vmem_t *vm, bt_t *bt)
      {
      
              KASSERT(vm->vm_nbusytag > 0);
              vm->vm_inuse -= bt->bt_size;
              vm->vm_nbusytag--;
              LIST_REMOVE(bt, bt_hashlist);
      }
      
      static void
      bt_insbusy(vmem_t *vm, bt_t *bt)
      {
              struct vmem_hashlist *list;
      
              KASSERT(bt->bt_type == BT_TYPE_BUSY);
      
              list = bt_hashhead(vm, bt->bt_start);
              LIST_INSERT_HEAD(list, bt, bt_hashlist);
              vm->vm_nbusytag++;
              vm->vm_inuse += bt->bt_size;
      }
      
      /* ---- boundary tag list */
      
      static void
      bt_remseg(vmem_t *vm, bt_t *bt)
      {
      
              TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist);
      }
      
      static void
      bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev)
      {
      
              TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist);
      }
      
      static void
      bt_insseg_tail(vmem_t *vm, bt_t *bt)
      {
      
              TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist);
      }
      
      static void
      bt_remfree(vmem_t *vm, bt_t *bt)
      {
      
              KASSERT(bt->bt_type == BT_TYPE_FREE);
      
              LIST_REMOVE(bt, bt_freelist);
      }
      
      static void
      bt_insfree(vmem_t *vm, bt_t *bt)
      {
              struct vmem_freelist *list;
      
              list = bt_freehead_tofree(vm, bt->bt_size);
              LIST_INSERT_HEAD(list, bt, bt_freelist);
      }
      
      /* ---- vmem internal functions */
      
      #if defined(QCACHE)
      static inline vm_flag_t
      prf_to_vmf(int prflags)
      {
              vm_flag_t vmflags;
      
              KASSERT((prflags & ~(PR_LIMITFAIL | PR_WAITOK | PR_NOWAIT)) == 0);
              if ((prflags & PR_WAITOK) != 0) {
                      vmflags = VM_SLEEP;
              } else {
                      vmflags = VM_NOSLEEP;
              }
              return vmflags;
      }
      
      static inline int
      vmf_to_prf(vm_flag_t vmflags)
      {
              int prflags;
      
              if ((vmflags & VM_SLEEP) != 0) {
                      prflags = PR_WAITOK;
              } else {
                      prflags = PR_NOWAIT;
              }
              return prflags;
      }
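
       /*
        * qc_poolpage_size: pick the pool page size for the quantum caches as
        * the smallest power of two strictly greater than 3 * qcache_max,
        * presumably so that each pool page can hold at least a few objects of
        * the largest cached size.
        */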
      
      static size_t
      qc_poolpage_size(size_t qcache_max)
      {
              int i;
      
              for (i = 0; ORDER2SIZE(i) <= qcache_max * 3; i++) {
                      /* nothing */
              }
              return ORDER2SIZE(i);
      }
      
      static void *
      qc_poolpage_alloc(struct pool *pool, int prflags)
      {
              qcache_t *qc = QC_POOL_TO_QCACHE(pool);
              vmem_t *vm = qc->qc_vmem;
              vmem_addr_t addr;
      
              if (vmem_alloc(vm, pool->pr_alloc->pa_pagesz,
                  prf_to_vmf(prflags) | VM_INSTANTFIT, &addr) != 0)
                      return NULL;
              return (void *)addr;
      }
      
      static void
      qc_poolpage_free(struct pool *pool, void *addr)
      {
              qcache_t *qc = QC_POOL_TO_QCACHE(pool);
              vmem_t *vm = qc->qc_vmem;
      
              vmem_free(vm, (vmem_addr_t)addr, pool->pr_alloc->pa_pagesz);
      }
      
      static void
      qc_init(vmem_t *vm, size_t qcache_max, int ipl)
      {
              qcache_t *prevqc;
              struct pool_allocator *pa;
              int qcache_idx_max;
              int i;
      
              KASSERT((qcache_max & vm->vm_quantum_mask) == 0);
              if (qcache_max > (VMEM_QCACHE_IDX_MAX << vm->vm_quantum_shift)) {
                      qcache_max = VMEM_QCACHE_IDX_MAX << vm->vm_quantum_shift;
              }
              vm->vm_qcache_max = qcache_max;
              pa = &vm->vm_qcache_allocator;
              memset(pa, 0, sizeof(*pa));
              pa->pa_alloc = qc_poolpage_alloc;
              pa->pa_free = qc_poolpage_free;
              pa->pa_pagesz = qc_poolpage_size(qcache_max);
      
              qcache_idx_max = qcache_max >> vm->vm_quantum_shift;
              prevqc = NULL;
              for (i = qcache_idx_max; i > 0; i--) {
                      qcache_t *qc = &vm->vm_qcache_store[i - 1];
                      size_t size = i << vm->vm_quantum_shift;
                      pool_cache_t pc;
      
                      qc->qc_vmem = vm;
                      snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu",
                          vm->vm_name, size);
      
                      pc = pool_cache_init(size,
                          ORDER2SIZE(vm->vm_quantum_shift), 0,
                          PR_NOALIGN | PR_NOTOUCH | PR_RECURSIVE /* XXX */,
                          qc->qc_name, pa, ipl, NULL, NULL, NULL);
      
                      KASSERT(pc);
      
                      qc->qc_cache = pc;
                      KASSERT(qc->qc_cache != NULL);        /* XXX */
                      if (prevqc != NULL &&
                          qc->qc_cache->pc_pool.pr_itemsperpage ==
                          prevqc->qc_cache->pc_pool.pr_itemsperpage) {
                              pool_cache_destroy(qc->qc_cache);
                              vm->vm_qcache[i - 1] = prevqc;
                              continue;
                      }
                      qc->qc_cache->pc_pool.pr_qcache = qc;
                      vm->vm_qcache[i - 1] = qc;
                      prevqc = qc;
              }
      }
      
      static void
      qc_destroy(vmem_t *vm)
      {
              const qcache_t *prevqc;
              int i;
              int qcache_idx_max;
      
              qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
              prevqc = NULL;
              for (i = 0; i < qcache_idx_max; i++) {
                      qcache_t *qc = vm->vm_qcache[i];
      
                      if (prevqc == qc) {
                              continue;
                      }
                      pool_cache_destroy(qc->qc_cache);
                      prevqc = qc;
              }
      }
      #endif
      
      #if defined(_KERNEL)
      static void
      vmem_bootstrap(void)
      {
      
              mutex_init(&vmem_list_lock, MUTEX_DEFAULT, IPL_VM);
              mutex_init(&vmem_btag_lock, MUTEX_DEFAULT, IPL_VM);
              mutex_init(&vmem_btag_refill_lock, MUTEX_DEFAULT, IPL_VM);
      
              while (static_bt_count-- > 0) {
                      bt_t *bt = &static_bts[static_bt_count];
                      LIST_INSERT_HEAD(&vmem_btag_freelist, bt, bt_freelist);
                      VMEM_EVCNT_INCR(static_bt_count);
                      vmem_btag_freelist_count++;
              }
              vmem_bootstrapped = TRUE;
      }
      
      void
      vmem_subsystem_init(vmem_t *vm)
      {
      
              kmem_va_meta_arena = vmem_init(&kmem_va_meta_arena_store, "vmem-va",
                  0, 0, PAGE_SIZE, vmem_alloc, vmem_free, vm,
                  0, VM_NOSLEEP | VM_BOOTSTRAP | VM_LARGEIMPORT,
                  IPL_VM);
      
              kmem_meta_arena = vmem_init(&kmem_meta_arena_store, "vmem-meta",
                  0, 0, PAGE_SIZE,
                  uvm_km_kmem_alloc, uvm_km_kmem_free, kmem_va_meta_arena,
                  0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);
      
              pool_init(&vmem_btag_pool, sizeof(bt_t), 0, 0, PR_PHINPAGE,
                          "vmembt", &pool_allocator_vmem_meta, IPL_VM);
      }
      #endif /* defined(_KERNEL) */
      
      static int
      vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, vm_flag_t flags,
          int spanbttype)
      {
              bt_t *btspan;
              bt_t *btfree;
      
              KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
              KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
              KASSERT(spanbttype == BT_TYPE_SPAN ||
                  spanbttype == BT_TYPE_SPAN_STATIC);
      
              btspan = bt_alloc(vm, flags);
              if (btspan == NULL) {
                      return ENOMEM;
              }
              btfree = bt_alloc(vm, flags);
              if (btfree == NULL) {
                      bt_free(vm, btspan);
                      return ENOMEM;
              }
      
              btspan->bt_type = spanbttype;
              btspan->bt_start = addr;
              btspan->bt_size = size;
      
              btfree->bt_type = BT_TYPE_FREE;
              btfree->bt_start = addr;
              btfree->bt_size = size;
      
              VMEM_LOCK(vm);
              bt_insseg_tail(vm, btspan);
              bt_insseg(vm, btfree, btspan);
              bt_insfree(vm, btfree);
              vm->vm_size += size;
              VMEM_UNLOCK(vm);
      
              return 0;
      }
      
      static void
      vmem_destroy1(vmem_t *vm)
      {
      
      #if defined(QCACHE)
              qc_destroy(vm);
      #endif /* defined(QCACHE) */
              if (vm->vm_hashlist != NULL) {
                      int i;
      
                      for (i = 0; i < vm->vm_hashsize; i++) {
                              bt_t *bt;
      
                              while ((bt = LIST_FIRST(&vm->vm_hashlist[i])) != NULL) {
                                      KASSERT(bt->bt_type == BT_TYPE_SPAN_STATIC);
                                      bt_free(vm, bt);
                              }
                      }
                      if (vm->vm_hashlist != &vm->vm_hash0) {
                              xfree(vm->vm_hashlist,
                                  sizeof(struct vmem_hashlist *) * vm->vm_hashsize);
                      }
              }
      
              bt_freetrim(vm, 0);
      
              VMEM_CONDVAR_DESTROY(vm);
              VMEM_LOCK_DESTROY(vm);
              xfree(vm, sizeof(*vm));
      }
      
      static int
      vmem_import(vmem_t *vm, vmem_size_t size, vm_flag_t flags)
      {
              vmem_addr_t addr;
              int rc;
      
              if (vm->vm_importfn == NULL) {
                      return EINVAL;
              }
      
              if (vm->vm_flags & VM_LARGEIMPORT) {
                      size *= 16;
              }
      
              if (vm->vm_flags & VM_XIMPORT) {
                      rc = ((vmem_ximport_t *)vm->vm_importfn)(vm->vm_arg, size,
                          &size, flags, &addr);
              } else {
                      rc = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr);
              }
              if (rc) {
                      return ENOMEM;
              }
      
              if (vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN) != 0) {
                      (*vm->vm_releasefn)(vm->vm_arg, addr, size);
                      return ENOMEM;
              }
      
              return 0;
      }
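
/*
 * Illustrative sketch (not part of this file): an import/release pair
 * matching the calling convention used by vmem_import() above.  The
 * names example_import(), example_release() and "source" are made up;
 * in practice vmem_alloc()/vmem_free() themselves already have this
 * signature and are used directly, e.g. in vmem_subsystem_init().
 *
 *	static int
 *	example_import(vmem_t *source, vmem_size_t size, vm_flag_t flags,
 *	    vmem_addr_t *addrp)
 *	{
 *
 *		return vmem_alloc(source, size, flags, addrp);
 *	}
 *
 *	static void
 *	example_release(vmem_t *source, vmem_addr_t addr, vmem_size_t size)
 *	{
 *
 *		vmem_free(source, addr, size);
 *	}
 */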
      
      static int
      vmem_rehash(vmem_t *vm, size_t newhashsize, vm_flag_t flags)
      {
              bt_t *bt;
              int i;
              struct vmem_hashlist *newhashlist;
              struct vmem_hashlist *oldhashlist;
              size_t oldhashsize;
      
              KASSERT(newhashsize > 0);
      
              newhashlist =
                  xmalloc(sizeof(struct vmem_hashlist *) * newhashsize, flags);
              if (newhashlist == NULL) {
                      return ENOMEM;
              }
              for (i = 0; i < newhashsize; i++) {
                      LIST_INIT(&newhashlist[i]);
              }
      
              if (!VMEM_TRYLOCK(vm)) {
                      xfree(newhashlist,
                          sizeof(struct vmem_hashlist *) * newhashsize);
                      return EBUSY;
              }
              oldhashlist = vm->vm_hashlist;
              oldhashsize = vm->vm_hashsize;
              vm->vm_hashlist = newhashlist;
              vm->vm_hashsize = newhashsize;
              if (oldhashlist == NULL) {
                      VMEM_UNLOCK(vm);
                      return 0;
              }
              for (i = 0; i < oldhashsize; i++) {
                      while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) {
                              bt_rembusy(vm, bt); /* XXX */
                              bt_insbusy(vm, bt);
                      }
              }
              VMEM_UNLOCK(vm);
      
              if (oldhashlist != &vm->vm_hash0) {
                      xfree(oldhashlist,
                          sizeof(struct vmem_hashlist *) * oldhashsize);
              }
      
              return 0;
      }
      
      /*
       * vmem_fit: check if a bt can satisfy the given restrictions.
       *
 * it's the caller's responsibility to ensure the region is big enough
       * before calling us.
       */
      
      static int
      vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align,
          vmem_size_t phase, vmem_size_t nocross,
          vmem_addr_t minaddr, vmem_addr_t maxaddr, vmem_addr_t *addrp)
      {
              vmem_addr_t start;
              vmem_addr_t end;
      
              KASSERT(size > 0);
              KASSERT(bt->bt_size >= size); /* caller's responsibility */
      
              /*
               * XXX assumption: vmem_addr_t and vmem_size_t are
 * unsigned integers of the same size.
               */
      
              start = bt->bt_start;
              if (start < minaddr) {
                      start = minaddr;
              }
              end = BT_END(bt);
              if (end > maxaddr) {
                      end = maxaddr;
              }
              if (start > end) {
                      return ENOMEM;
              }
      
              start = VMEM_ALIGNUP(start - phase, align) + phase;
              if (start < bt->bt_start) {
                      start += align;
              }
              if (VMEM_CROSS_P(start, start + size - 1, nocross)) {
                      KASSERT(align < nocross);
                      start = VMEM_ALIGNUP(start - phase, nocross) + phase;
              }
              if (start <= end && end - start >= size - 1) {
                      KASSERT((start & (align - 1)) == phase);
                      KASSERT(!VMEM_CROSS_P(start, start + size - 1, nocross));
                      KASSERT(minaddr <= start);
                      KASSERT(maxaddr == 0 || start + size - 1 <= maxaddr);
                      KASSERT(bt->bt_start <= start);
                      KASSERT(BT_END(bt) - start >= size - 1);
                      *addrp = start;
                      return 0;
              }
              return ENOMEM;
      }
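
/*
 * Worked example (illustrative numbers): given a free tag with
 * bt_start = 0x1050 and BT_END(bt) = 0x1fff, a request of size 0x100
 * with align = 0x200, phase = 0x80 and no nocross/minaddr/maxaddr
 * restrictions computes
 *
 *	start = VMEM_ALIGNUP(0x1050 - 0x80, 0x200) + 0x80 = 0x1080,
 *
 * which lies within the tag and leaves room for 0x100 bytes, so the
 * allocation is placed at 0x1080 (note 0x1080 & (0x200 - 1) == 0x80,
 * i.e. the requested phase).
 */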
      
      /* ---- vmem API */
      
      /*
 * vmem_init: creates a vmem arena; used by vmem_create() and vmem_xcreate().
       */
      
      vmem_t *
      vmem_init(vmem_t *vm, const char *name,
          vmem_addr_t base, vmem_size_t size, vmem_size_t quantum,
          vmem_import_t *importfn, vmem_release_t *releasefn,
          vmem_t *arg, vmem_size_t qcache_max, vm_flag_t flags, int ipl)
      {
              int i;
      
              KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
              KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
              KASSERT(quantum > 0);
      
      #if defined(_KERNEL)
              /* XXX: SMP, we get called early... */
              if (!vmem_bootstrapped) {
                      vmem_bootstrap();
              }
      #endif /* defined(_KERNEL) */
      
              if (vm == NULL) {
                      vm = xmalloc(sizeof(*vm), flags);
              }
              if (vm == NULL) {
                      return NULL;
              }
      
              VMEM_CONDVAR_INIT(vm, "vmem");
              VMEM_LOCK_INIT(vm, ipl);
              vm->vm_flags = flags;
              vm->vm_nfreetags = 0;
              LIST_INIT(&vm->vm_freetags);
              strlcpy(vm->vm_name, name, sizeof(vm->vm_name));
              vm->vm_quantum_mask = quantum - 1;
              vm->vm_quantum_shift = SIZE2ORDER(quantum);
              KASSERT(ORDER2SIZE(vm->vm_quantum_shift) == quantum);
              vm->vm_importfn = importfn;
              vm->vm_releasefn = releasefn;
              vm->vm_arg = arg;
              vm->vm_nbusytag = 0;
              vm->vm_size = 0;
              vm->vm_inuse = 0;
      #if defined(QCACHE)
              qc_init(vm, qcache_max, ipl);
      #endif /* defined(QCACHE) */
      
              TAILQ_INIT(&vm->vm_seglist);
              for (i = 0; i < VMEM_MAXORDER; i++) {
                      LIST_INIT(&vm->vm_freelist[i]);
              }
              memset(&vm->vm_hash0, 0, sizeof(struct vmem_hashlist));
              vm->vm_hashsize = 1;
              vm->vm_hashlist = &vm->vm_hash0;
      
              if (size != 0) {
                      if (vmem_add(vm, base, size, flags) != 0) {
                              vmem_destroy1(vm);
                              return NULL;
                      }
              }
      
      #if defined(_KERNEL)
              if (flags & VM_BOOTSTRAP) {
                      bt_refill(vm);
              }
      
              mutex_enter(&vmem_list_lock);
              LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist);
              mutex_exit(&vmem_list_lock);
      #endif /* defined(_KERNEL) */
      
              return vm;
}

      /*
       * vmem_create: create an arena.
       *
       * => must not be called from interrupt context.
       */
      
      vmem_t *
      vmem_create(const char *name, vmem_addr_t base, vmem_size_t size,
          vmem_size_t quantum, vmem_import_t *importfn, vmem_release_t *releasefn,
          vmem_t *source, vmem_size_t qcache_max, vm_flag_t flags, int ipl)
      {
      
              KASSERT((flags & (VM_XIMPORT)) == 0);
      
              return vmem_init(NULL, name, base, size, quantum,
                  importfn, releasefn, source, qcache_max, flags, ipl);
      }
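
/*
 * Usage sketch (illustrative values only; "arena" and the constants are
 * made up):
 *
 *	vmem_t *arena;
 *	vmem_addr_t va;
 *
 *	arena = vmem_create("example", 0x1000, 0x10000, PAGE_SIZE,
 *	    NULL, NULL, NULL, 0, VM_SLEEP, IPL_NONE);
 *	if (vmem_alloc(arena, 3 * PAGE_SIZE, VM_INSTANTFIT | VM_SLEEP,
 *	    &va) == 0) {
 *		... use [va, va + 3 * PAGE_SIZE) ...
 *		vmem_free(arena, va, 3 * PAGE_SIZE);
 *	}
 *	vmem_destroy(arena);
 */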
      
      /*
 * vmem_xcreate: create an arena that takes an alternative import function.
       *
       * => must not be called from interrupt context.
       */
      
      vmem_t *
      vmem_xcreate(const char *name, vmem_addr_t base, vmem_size_t size,
          vmem_size_t quantum, vmem_ximport_t *importfn, vmem_release_t *releasefn,
          vmem_t *source, vmem_size_t qcache_max, vm_flag_t flags, int ipl)
      {
      
              KASSERT((flags & (VM_XIMPORT)) == 0);
      
              return vmem_init(NULL, name, base, size, quantum,
                  (vmem_import_t *)importfn, releasefn, source,
                  qcache_max, flags | VM_XIMPORT, ipl);
      }
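
/*
 * Illustrative sketch: unlike a vmem_import_t, a vmem_ximport_t may
 * import more than was asked for and reports the actual size back
 * through a pointer (see the VM_XIMPORT branch in vmem_import()).
 * example_ximport() and EXAMPLE_CHUNK are made-up names.
 *
 *	static int
 *	example_ximport(vmem_t *source, vmem_size_t size,
 *	    vmem_size_t *actualp, vm_flag_t flags, vmem_addr_t *addrp)
 *	{
 *		vmem_size_t sz = MAX(size, EXAMPLE_CHUNK);
 *
 *		*actualp = sz;
 *		return vmem_alloc(source, sz, flags, addrp);
 *	}
 */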
      
      void
      vmem_destroy(vmem_t *vm)
      {
      
      #if defined(_KERNEL)
              mutex_enter(&vmem_list_lock);
              LIST_REMOVE(vm, vm_alllist);
              mutex_exit(&vmem_list_lock);
      #endif /* defined(_KERNEL) */
      
              vmem_destroy1(vm);
      }
      
      vmem_size_t
      vmem_roundup_size(vmem_t *vm, vmem_size_t size)
      {
      
              return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask;
      }
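
/*
 * e.g. with quantum = 4096 (mask = 0xfff), a request of 5000 bytes is
 * rounded up to (5000 + 0xfff) & ~0xfff = 8192.
 */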
      
      /*
       * vmem_alloc: allocate resource from the arena.
       */
      
      int
      vmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags, vmem_addr_t *addrp)
      {
              const vm_flag_t strat __diagused = flags & VM_FITMASK;
              int error;
      
              KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
              KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
      
              KASSERT(size > 0);
              KASSERT(strat == VM_BESTFIT || strat == VM_INSTANTFIT);
              if ((flags & VM_SLEEP) != 0) {
                      ASSERT_SLEEPABLE();
              }
      
      #if defined(QCACHE)
              if (size <= vm->vm_qcache_max) {
                      void *p;
                      int qidx = (size + vm->vm_quantum_mask) >> vm->vm_quantum_shift;
                      qcache_t *qc = vm->vm_qcache[qidx - 1];
      
                      p = pool_cache_get(qc->qc_cache, vmf_to_prf(flags));
                      if (addrp != NULL)
                              *addrp = (vmem_addr_t)p;
                      error = (p == NULL) ? ENOMEM : 0;
                      goto out;
              }
      #endif /* defined(QCACHE) */
      
              error = vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
                  flags, addrp);
      out:
              KASSERT(error == 0 || (flags & VM_SLEEP) == 0);
              return error;
      }
      
      int
      vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align,
          const vmem_size_t phase, const vmem_size_t nocross,
          const vmem_addr_t minaddr, const vmem_addr_t maxaddr, const vm_flag_t flags,
          vmem_addr_t *addrp)
      {
              struct vmem_freelist *list;
              struct vmem_freelist *first;
              struct vmem_freelist *end;
              bt_t *bt;
              bt_t *btnew;
              bt_t *btnew2;
              const vmem_size_t size = vmem_roundup_size(vm, size0);
              vm_flag_t strat = flags & VM_FITMASK;
              vmem_addr_t start;
              int rc;
      
              KASSERT(size0 > 0);
              KASSERT(size > 0);
              KASSERT(strat == VM_BESTFIT || strat == VM_INSTANTFIT);
              if ((flags & VM_SLEEP) != 0) {
                      ASSERT_SLEEPABLE();
              }
              KASSERT((align & vm->vm_quantum_mask) == 0);
              KASSERT((align & (align - 1)) == 0);
              KASSERT((phase & vm->vm_quantum_mask) == 0);
              KASSERT((nocross & vm->vm_quantum_mask) == 0);
              KASSERT((nocross & (nocross - 1)) == 0);
              KASSERT((align == 0 && phase == 0) || phase < align);
              KASSERT(nocross == 0 || nocross >= size);
              KASSERT(minaddr <= maxaddr);
              KASSERT(!VMEM_CROSS_P(phase, phase + size - 1, nocross));
      
              if (align == 0) {
                      align = vm->vm_quantum_mask + 1;
              }
      
              /*
               * allocate boundary tags before acquiring the vmem lock.
               */
              btnew = bt_alloc(vm, flags);
              if (btnew == NULL) {
                      return ENOMEM;
              }
              btnew2 = bt_alloc(vm, flags); /* XXX not necessary if no restrictions */
              if (btnew2 == NULL) {
                      bt_free(vm, btnew);
                      return ENOMEM;
              }
      
              /*
               * choose a free block from which we allocate.
               */
      retry_strat:
              first = bt_freehead_toalloc(vm, size, strat);
              end = &vm->vm_freelist[VMEM_MAXORDER];
      retry:
              bt = NULL;
              VMEM_LOCK(vm);
              vmem_check(vm);
              if (strat == VM_INSTANTFIT) {
                      /*
                       * just choose the first block which satisfies our restrictions.
                       *
                       * note that we don't need to check the size of the blocks
		 * because any blocks found on these lists should be larger than
                       * the given size.
                       */
                      for (list = first; list < end; list++) {
                              bt = LIST_FIRST(list);
                              if (bt != NULL) {
                                      rc = vmem_fit(bt, size, align, phase,
                                          nocross, minaddr, maxaddr, &start);
                                      if (rc == 0) {
                                              goto gotit;
                                      }
                                      /*
                                       * don't bother to follow the bt_freelist link
                                       * here.  the list can be very long and we are
                                       * told to run fast.  blocks from the later free
                                       * lists are larger and have better chances to
                                       * satisfy our restrictions.
                                       */
                              }
                      }
              } else { /* VM_BESTFIT */
                      /*
                       * we assume that, for space efficiency, it's better to
                       * allocate from a smaller block.  thus we will start searching
		 * from a lower-order list than VM_INSTANTFIT does.
                       * however, don't bother to find the smallest block in a free
                       * list because the list can be very long.  we can revisit it
                       * if/when it turns out to be a problem.
                       *
                       * note that the 'first' list can contain blocks smaller than
                       * the requested size.  thus we need to check bt_size.
                       */
                      for (list = first; list < end; list++) {
                              LIST_FOREACH(bt, list, bt_freelist) {
                                      if (bt->bt_size >= size) {
                                              rc = vmem_fit(bt, size, align, phase,
                                                  nocross, minaddr, maxaddr, &start);
                                              if (rc == 0) {
                                                      goto gotit;
                                              }
                                      }
                              }
                      }
              }
              VMEM_UNLOCK(vm);
      #if 1
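	/*
	 * instant fit found no suitable block; fall back to best fit,
	 * which also examines blocks on the lower-order free lists.
	 */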
              if (strat == VM_INSTANTFIT) {
                      strat = VM_BESTFIT;
                      goto retry_strat;
              }
      #endif
              if (align != vm->vm_quantum_mask + 1 || phase != 0 || nocross != 0) {
      
                      /*
                       * XXX should try to import a region large enough to
                       * satisfy restrictions?
                       */
      
                      goto fail;
              }
              /* XXX eeek, minaddr & maxaddr not respected */
              if (vmem_import(vm, size, flags) == 0) {
                      goto retry;
              }
              /* XXX */
      
              if ((flags & VM_SLEEP) != 0) {
                      vmem_kick_pdaemon();
                      VMEM_LOCK(vm);
                      VMEM_CONDVAR_WAIT(vm);
                      VMEM_UNLOCK(vm);
                      goto retry;
              }
      fail:
              bt_free(vm, btnew);
              bt_free(vm, btnew2);
              return ENOMEM;
      
      gotit:
              KASSERT(bt->bt_type == BT_TYPE_FREE);
              KASSERT(bt->bt_size >= size);
              bt_remfree(vm, bt);
              vmem_check(vm);
              if (bt->bt_start != start) {
                      btnew2->bt_type = BT_TYPE_FREE;
                      btnew2->bt_start = bt->bt_start;
                      btnew2->bt_size = start - bt->bt_start;
                      bt->bt_start = start;
                      bt->bt_size -= btnew2->bt_size;
                      bt_insfree(vm, btnew2);
                      bt_insseg(vm, btnew2, TAILQ_PREV(bt, vmem_seglist, bt_seglist));
                      btnew2 = NULL;
                      vmem_check(vm);
              }
              KASSERT(bt->bt_start == start);
              if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) {
                      /* split */
                      btnew->bt_type = BT_TYPE_BUSY;
                      btnew->bt_start = bt->bt_start;
                      btnew->bt_size = size;
                      bt->bt_start = bt->bt_start + size;
                      bt->bt_size -= size;
                      bt_insfree(vm, bt);
                      bt_insseg(vm, btnew, TAILQ_PREV(bt, vmem_seglist, bt_seglist));
                      bt_insbusy(vm, btnew);
                      vmem_check(vm);
                      VMEM_UNLOCK(vm);
              } else {
                      bt->bt_type = BT_TYPE_BUSY;
                      bt_insbusy(vm, bt);
                      vmem_check(vm);
                      VMEM_UNLOCK(vm);
                      bt_free(vm, btnew);
                      btnew = bt;
              }
              if (btnew2 != NULL) {
                      bt_free(vm, btnew2);
              }
              KASSERT(btnew->bt_size >= size);
              btnew->bt_type = BT_TYPE_BUSY;
      
              if (addrp != NULL)
                      *addrp = btnew->bt_start;
              return 0;
      }
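
/*
 * Example of a constrained allocation (illustrative values; "arena" is
 * a hypothetical vmem_t * with a quantum of PAGE_SIZE or smaller):
 * 0x2000 bytes, aligned to 0x1000, not crossing a 0x10000 boundary and
 * taken from the low 16MB:
 *
 *	vmem_addr_t va;
 *	int error = vmem_xalloc(arena, 0x2000, 0x1000, 0, 0x10000,
 *	    0, 0x00ffffff, VM_BESTFIT | VM_SLEEP, &va);
 *
 * Such allocations are released with vmem_xfree() rather than
 * vmem_free(), since vmem_free() may hand small sizes to the quantum
 * cache.
 */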
      
      /*
       * vmem_free: free the resource to the arena.
       */
      
      void
      vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
      {
      
              KASSERT(size > 0);
      
      #if defined(QCACHE)
              if (size <= vm->vm_qcache_max) {
                      int qidx = (size + vm->vm_quantum_mask) >> vm->vm_quantum_shift;
                      qcache_t *qc = vm->vm_qcache[qidx - 1];
      
                      pool_cache_put(qc->qc_cache, (void *)addr);
                      return;
              }
      #endif /* defined(QCACHE) */
      
              vmem_xfree(vm, addr, size);
      }
      
      void
      vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
      {
              bt_t *bt;
              bt_t *t;
              LIST_HEAD(, vmem_btag) tofree;
      
              LIST_INIT(&tofree);
      
              KASSERT(size > 0);
      
              VMEM_LOCK(vm);
      
              bt = bt_lookupbusy(vm, addr);
              KASSERT(bt != NULL);
              KASSERT(bt->bt_start == addr);
              KASSERT(bt->bt_size == vmem_roundup_size(vm, size) ||
                  bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask);
              KASSERT(bt->bt_type == BT_TYPE_BUSY);
              bt_rembusy(vm, bt);
              bt->bt_type = BT_TYPE_FREE;
      
              /* coalesce */
              t = TAILQ_NEXT(bt, bt_seglist);
              if (t != NULL && t->bt_type == BT_TYPE_FREE) {
                      KASSERT(BT_END(bt) < t->bt_start);        /* YYY */
                      bt_remfree(vm, t);
                      bt_remseg(vm, t);
                      bt->bt_size += t->bt_size;
                      LIST_INSERT_HEAD(&tofree, t, bt_freelist);
              }
              t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
              if (t != NULL && t->bt_type == BT_TYPE_FREE) {
                      KASSERT(BT_END(t) < bt->bt_start);        /* YYY */
                      bt_remfree(vm, t);
                      bt_remseg(vm, t);
                      bt->bt_size += t->bt_size;
                      bt->bt_start = t->bt_start;
                      LIST_INSERT_HEAD(&tofree, t, bt_freelist);
              }
      
              t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
              KASSERT(t != NULL);
              KASSERT(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY);
              if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN &&
                  t->bt_size == bt->bt_size) {
                      vmem_addr_t spanaddr;
                      vmem_size_t spansize;
      
                      KASSERT(t->bt_start == bt->bt_start);
                      spanaddr = bt->bt_start;
                      spansize = bt->bt_size;
                      bt_remseg(vm, bt);
                      LIST_INSERT_HEAD(&tofree, bt, bt_freelist);
                      bt_remseg(vm, t);
                      LIST_INSERT_HEAD(&tofree, t, bt_freelist);
                      vm->vm_size -= spansize;
                      VMEM_CONDVAR_BROADCAST(vm);
                      VMEM_UNLOCK(vm);
                      (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize);
              } else {
                      bt_insfree(vm, bt);
                      VMEM_CONDVAR_BROADCAST(vm);
                      VMEM_UNLOCK(vm);
              }
      
              while (!LIST_EMPTY(&tofree)) {
                      t = LIST_FIRST(&tofree);
                      LIST_REMOVE(t, bt_freelist);
                      bt_free(vm, t);
              }
      
              bt_freetrim(vm, BT_MAXFREE);
      }
      
      /*
       * vmem_add:
       *
 * => the caller must ensure an appropriate spl if the arena can be
 *    accessed from interrupt context.
       */
      
      int
      vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, vm_flag_t flags)
      {
      
              return vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN_STATIC);
      }
      
      /*
 * vmem_size: information about an arena's size
 *
 * => return the free and/or allocated size of the arena, as selected
 *    by typemask
       */
      vmem_size_t
      vmem_size(vmem_t *vm, int typemask)
      {
      
	switch (typemask) {
	case VMEM_ALLOC:
		return vm->vm_inuse;
	case VMEM_FREE:
		return vm->vm_size - vm->vm_inuse;
	case VMEM_FREE|VMEM_ALLOC:
		return vm->vm_size;
              default:
                      panic("vmem_size");
              }
      }
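
/*
 * e.g. vmem_size(vm, VMEM_FREE | VMEM_ALLOC) reports the total size of
 * all spans in the arena, while vmem_size(vm, VMEM_FREE) reports how
 * much of that is currently available.
 */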
      
      /* ---- rehash */
      
      #if defined(_KERNEL)
      static struct callout vmem_rehash_ch;
      static int vmem_rehash_interval;
      static struct workqueue *vmem_rehash_wq;
      static struct work vmem_rehash_wk;
      
      static void
      vmem_rehash_all(struct work *wk, void *dummy)
      {
              vmem_t *vm;
      
              KASSERT(wk == &vmem_rehash_wk);
              mutex_enter(&vmem_list_lock);
              LIST_FOREACH(vm, &vmem_list, vm_alllist) {
                      size_t desired;
                      size_t current;
      
                      if (!VMEM_TRYLOCK(vm)) {
                              continue;
                      }
                      desired = vm->vm_nbusytag;
                      current = vm->vm_hashsize;
                      VMEM_UNLOCK(vm);
      
                      if (desired > VMEM_HASHSIZE_MAX) {
                              desired = VMEM_HASHSIZE_MAX;
                      } else if (desired < VMEM_HASHSIZE_MIN) {
                              desired = VMEM_HASHSIZE_MIN;
                      }
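		/*
		 * only rehash when the desired size differs from the
		 * current size by more than a factor of two, to avoid
		 * resizing the hash table too eagerly.
		 */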
                      if (desired > current * 2 || desired * 2 < current) {
                              vmem_rehash(vm, desired, VM_NOSLEEP);
                      }
              }
              mutex_exit(&vmem_list_lock);
      
              callout_schedule(&vmem_rehash_ch, vmem_rehash_interval);
      }
      
      static void
      vmem_rehash_all_kick(void *dummy)
      {
      
              workqueue_enqueue(vmem_rehash_wq, &vmem_rehash_wk, NULL);
      }
      
      void
      vmem_rehash_start(void)
      {
              int error;
      
              error = workqueue_create(&vmem_rehash_wq, "vmem_rehash",
                  vmem_rehash_all, NULL, PRI_VM, IPL_SOFTCLOCK, WQ_MPSAFE);
              if (error) {
                      panic("%s: workqueue_create %d\n", __func__, error);
              }
              callout_init(&vmem_rehash_ch, CALLOUT_MPSAFE);
              callout_setfunc(&vmem_rehash_ch, vmem_rehash_all_kick, NULL);
      
              vmem_rehash_interval = hz * 10;
              callout_schedule(&vmem_rehash_ch, vmem_rehash_interval);
      }
      #endif /* defined(_KERNEL) */
      
      /* ---- debug */
      
      #if defined(DDB) || defined(UNITTEST) || defined(VMEM_SANITY)
      
      static void bt_dump(const bt_t *, void (*)(const char *, ...)
          __printflike(1, 2));
      
      static const char *
      bt_type_string(int type)
      {
              static const char * const table[] = {
                      [BT_TYPE_BUSY] = "busy",
                      [BT_TYPE_FREE] = "free",
                      [BT_TYPE_SPAN] = "span",
                      [BT_TYPE_SPAN_STATIC] = "static span",
              };
      
              if (type >= __arraycount(table)) {
                      return "BOGUS";
              }
              return table[type];
      }
      
      static void
      bt_dump(const bt_t *bt, void (*pr)(const char *, ...))
      {
      
              (*pr)("\t%p: %" PRIu64 ", %" PRIu64 ", %d(%s)\n",
                  bt, (uint64_t)bt->bt_start, (uint64_t)bt->bt_size,
                  bt->bt_type, bt_type_string(bt->bt_type));
      }
      
      static void
      vmem_dump(const vmem_t *vm , void (*pr)(const char *, ...) __printflike(1, 2))
      {
              const bt_t *bt;
              int i;
      
              (*pr)("vmem %p '%s'\n", vm, vm->vm_name);
              TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
                      bt_dump(bt, pr);
              }
      
              for (i = 0; i < VMEM_MAXORDER; i++) {
                      const struct vmem_freelist *fl = &vm->vm_freelist[i];
      
                      if (LIST_EMPTY(fl)) {
                              continue;
                      }
      
                      (*pr)("freelist[%d]\n", i);
                      LIST_FOREACH(bt, fl, bt_freelist) {
                              bt_dump(bt, pr);
                      }
              }
      }
      
      #endif /* defined(DDB) || defined(UNITTEST) || defined(VMEM_SANITY) */
      
      #if defined(DDB)
      static bt_t *
      vmem_whatis_lookup(vmem_t *vm, uintptr_t addr)
      {
              bt_t *bt;
      
              TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
                      if (BT_ISSPAN_P(bt)) {
                              continue;
                      }
                      if (bt->bt_start <= addr && addr <= BT_END(bt)) {
                              return bt;
                      }
              }
      
              return NULL;
      }
      
      void
      vmem_whatis(uintptr_t addr, void (*pr)(const char *, ...))
      {
              vmem_t *vm;
      
              LIST_FOREACH(vm, &vmem_list, vm_alllist) {
                      bt_t *bt;
      
                      bt = vmem_whatis_lookup(vm, addr);
                      if (bt == NULL) {
                              continue;
                      }
                      (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n",
                          (void *)addr, (void *)bt->bt_start,
                          (size_t)(addr - bt->bt_start), vm->vm_name,
                          (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free");
              }
      }
      
      void
      vmem_printall(const char *modif, void (*pr)(const char *, ...))
      {
              const vmem_t *vm;
      
              LIST_FOREACH(vm, &vmem_list, vm_alllist) {
                      vmem_dump(vm, pr);
              }
      }
      
      void
      vmem_print(uintptr_t addr, const char *modif, void (*pr)(const char *, ...))
      {
              const vmem_t *vm = (const void *)addr;
      
              vmem_dump(vm, pr);
      }
      #endif /* defined(DDB) */
      
      #if defined(_KERNEL)
      #define vmem_printf printf
      #else
      #include <stdio.h>
      #include <stdarg.h>
      
      static void
      vmem_printf(const char *fmt, ...)
      {
              va_list ap;
              va_start(ap, fmt);
              vprintf(fmt, ap);
              va_end(ap);
      }
      #endif
      
      #if defined(VMEM_SANITY)
      
      static bool
      vmem_check_sanity(vmem_t *vm)
      {
              const bt_t *bt, *bt2;
      
              KASSERT(vm != NULL);
      
              TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
                      if (bt->bt_start > BT_END(bt)) {
                              printf("corrupted tag\n");
                              bt_dump(bt, vmem_printf);
                              return false;
                      }
              }
              TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
                      TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) {
                              if (bt == bt2) {
                                      continue;
                              }
                              if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) {
                                      continue;
                              }
                              if (bt->bt_start <= BT_END(bt2) &&
                                  bt2->bt_start <= BT_END(bt)) {
                                      printf("overwrapped tags\n");
                                      bt_dump(bt, vmem_printf);
                                      bt_dump(bt2, vmem_printf);
                                      return false;
                              }
                      }
              }
      
              return true;
      }
      
      static void
      vmem_check(vmem_t *vm)
      {
      
              if (!vmem_check_sanity(vm)) {
                      panic("insanity vmem %p", vm);
              }
      }
      
      #endif /* defined(VMEM_SANITY) */
      
      #if defined(UNITTEST)
      int
      main(void)
      {
              int rc;
              vmem_t *vm;
              vmem_addr_t p;
              struct reg {
                      vmem_addr_t p;
                      vmem_size_t sz;
                      bool x;
              } *reg = NULL;
              int nreg = 0;
              int nalloc = 0;
              int nfree = 0;
              vmem_size_t total = 0;
      #if 1
              vm_flag_t strat = VM_INSTANTFIT;
      #else
              vm_flag_t strat = VM_BESTFIT;
      #endif
      
              vm = vmem_create("test", 0, 0, 1, NULL, NULL, NULL, 0, VM_SLEEP,
      #ifdef _KERNEL
                  IPL_NONE
      #else
                  0
      #endif
                  );
              if (vm == NULL) {
                      printf("vmem_create\n");
                      exit(EXIT_FAILURE);
              }
              vmem_dump(vm, vmem_printf);
      
              rc = vmem_add(vm, 0, 50, VM_SLEEP);
              assert(rc == 0);
              rc = vmem_add(vm, 100, 200, VM_SLEEP);
              assert(rc == 0);
              rc = vmem_add(vm, 2000, 1, VM_SLEEP);
              assert(rc == 0);
              rc = vmem_add(vm, 40000, 65536, VM_SLEEP);
              assert(rc == 0);
              rc = vmem_add(vm, 10000, 10000, VM_SLEEP);
              assert(rc == 0);
              rc = vmem_add(vm, 500, 1000, VM_SLEEP);
              assert(rc == 0);
              rc = vmem_add(vm, 0xffffff00, 0x100, VM_SLEEP);
              assert(rc == 0);
              rc = vmem_xalloc(vm, 0x101, 0, 0, 0,
                  0xffffff00, 0xffffffff, strat|VM_SLEEP, &p);
              assert(rc != 0);
              rc = vmem_xalloc(vm, 50, 0, 0, 0, 0, 49, strat|VM_SLEEP, &p);
              assert(rc == 0 && p == 0);
              vmem_xfree(vm, p, 50);
              rc = vmem_xalloc(vm, 25, 0, 0, 0, 0, 24, strat|VM_SLEEP, &p);
              assert(rc == 0 && p == 0);
              rc = vmem_xalloc(vm, 0x100, 0, 0, 0,
                  0xffffff01, 0xffffffff, strat|VM_SLEEP, &p);
              assert(rc != 0);
              rc = vmem_xalloc(vm, 0x100, 0, 0, 0,
                  0xffffff00, 0xfffffffe, strat|VM_SLEEP, &p);
              assert(rc != 0);
              rc = vmem_xalloc(vm, 0x100, 0, 0, 0,
                  0xffffff00, 0xffffffff, strat|VM_SLEEP, &p);
              assert(rc == 0);
              vmem_dump(vm, vmem_printf);
              for (;;) {
                      struct reg *r;
                      int t = rand() % 100;
      
                      if (t > 45) {
                              /* alloc */
                              vmem_size_t sz = rand() % 500 + 1;
                              bool x;
                              vmem_size_t align, phase, nocross;
                              vmem_addr_t minaddr, maxaddr;
      
                              if (t > 70) {
                                      x = true;
                                      /* XXX */
                                      align = 1 << (rand() % 15);
                                      phase = rand() % 65536;
                                      nocross = 1 << (rand() % 15);
                                      if (align <= phase) {
                                              phase = 0;
                                      }
                                      if (VMEM_CROSS_P(phase, phase + sz - 1,
                                          nocross)) {
                                              nocross = 0;
                                      }
                                      do {
                                              minaddr = rand() % 50000;
                                              maxaddr = rand() % 70000;
                                      } while (minaddr > maxaddr);
                                      printf("=== xalloc %" PRIu64
                                          " align=%" PRIu64 ", phase=%" PRIu64
                                          ", nocross=%" PRIu64 ", min=%" PRIu64
                                          ", max=%" PRIu64 "\n",
                                          (uint64_t)sz,
                                          (uint64_t)align,
                                          (uint64_t)phase,
                                          (uint64_t)nocross,
                                          (uint64_t)minaddr,
                                          (uint64_t)maxaddr);
                                      rc = vmem_xalloc(vm, sz, align, phase, nocross,
                                          minaddr, maxaddr, strat|VM_SLEEP, &p);
                              } else {
                                      x = false;
                                      printf("=== alloc %" PRIu64 "\n", (uint64_t)sz);
                                      rc = vmem_alloc(vm, sz, strat|VM_SLEEP, &p);
                              }
                              printf("-> %" PRIu64 "\n", (uint64_t)p);
                              vmem_dump(vm, vmem_printf);
                              if (rc != 0) {
                                      if (x) {
                                              continue;
                                      }
                                      break;
                              }
                              nreg++;
                              reg = realloc(reg, sizeof(*reg) * nreg);
                              r = &reg[nreg - 1];
                              r->p = p;
                              r->sz = sz;
                              r->x = x;
                              total += sz;
                              nalloc++;
                      } else if (nreg != 0) {
                              /* free */
                              r = &reg[rand() % nreg];
                              printf("=== free %" PRIu64 ", %" PRIu64 "\n",
                                  (uint64_t)r->p, (uint64_t)r->sz);
                              if (r->x) {
                                      vmem_xfree(vm, r->p, r->sz);
                              } else {
                                      vmem_free(vm, r->p, r->sz);
                              }
                              total -= r->sz;
                              vmem_dump(vm, vmem_printf);
                              *r = reg[nreg - 1];
                              nreg--;
                              nfree++;
                      }
                      printf("total=%" PRIu64 "\n", (uint64_t)total);
              }
              fprintf(stderr, "total=%" PRIu64 ", nalloc=%d, nfree=%d\n",
                  (uint64_t)total, nalloc, nfree);
              exit(EXIT_SUCCESS);
      }
      #endif /* defined(UNITTEST) */
      /*        $NetBSD: kern_lock.c,v 1.163 2019/05/09 05:00:31 ozaki-r Exp $        */
      
      /*-
       * Copyright (c) 2002, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
       * NASA Ames Research Center, and by Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_lock.c,v 1.163 2019/05/09 05:00:31 ozaki-r Exp $");
      
      #include <sys/param.h>
      #include <sys/proc.h>
      #include <sys/lock.h>
      #include <sys/systm.h>
      #include <sys/kernel.h>
      #include <sys/lockdebug.h>
      #include <sys/cpu.h>
      #include <sys/syslog.h>
      #include <sys/atomic.h>
      #include <sys/lwp.h>
      #include <sys/pserialize.h>
      
      #include <machine/lock.h>
      
      #include <dev/lockstat.h>
      
      #define        RETURN_ADDRESS        (uintptr_t)__builtin_return_address(0)
      
      bool        kernel_lock_dodebug;
      
      __cpu_simple_lock_t kernel_lock[CACHE_LINE_SIZE / sizeof(__cpu_simple_lock_t)]
          __cacheline_aligned;
      
      void
      assert_sleepable(void)
      {
              const char *reason;
              uint64_t pctr;
              bool idle;
      
	if (panicstr != NULL) {
                      return;
              }
      
	LOCKDEBUG_BARRIER(kernel_lock, 1);
      
              /*
               * Avoid disabling/re-enabling preemption here since this
               * routine may be called in delicate situations.
               */
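	/*
	 * lwp_pctr() changes whenever this LWP is preempted or migrated;
	 * retry so that the idle check is known to have been made on the
	 * CPU we are still running on.
	 */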
              do {
		pctr = lwp_pctr();
                      idle = CURCPU_IDLE_P();
              } while (pctr != lwp_pctr());
      
              reason = NULL;
	if (idle && !cold &&
                  kcpuset_isset(kcpuset_running, cpu_index(curcpu()))) {
                      reason = "idle";
              }
	if (cpu_intr_p()) {
                      reason = "interrupt";
              }
	if (cpu_softintr_p()) {
                      reason = "softint";
              }
	if (!pserialize_not_in_read_section()) {
                      reason = "pserialize";
              }
      
	if (reason) {
                      panic("%s: %s caller=%p", __func__, reason,
                          (void *)RETURN_ADDRESS);
              }
      }
      
      /*
       * Functions for manipulating the kernel_lock.  We put them here
       * so that they show up in profiles.
       */
      
      #define        _KERNEL_LOCK_ABORT(msg)                                                \
          LOCKDEBUG_ABORT(__func__, __LINE__, kernel_lock, &_kernel_lock_ops, msg)
      
      #ifdef LOCKDEBUG
      #define        _KERNEL_LOCK_ASSERT(cond)                                        \
      do {                                                                        \
              if (!(cond))                                                        \
                      _KERNEL_LOCK_ABORT("assertion failed: " #cond);                \
      } while (/* CONSTCOND */ 0)
      #else
      #define        _KERNEL_LOCK_ASSERT(cond)        /* nothing */
      #endif
      
      static void        _kernel_lock_dump(const volatile void *, lockop_printer_t);
      
      lockops_t _kernel_lock_ops = {
              .lo_name = "Kernel lock",
              .lo_type = LOCKOPS_SPIN,
              .lo_dump = _kernel_lock_dump,
      };
      
      /*
       * Initialize the kernel lock.
       */
      void
      kernel_lock_init(void)
      {
      
              __cpu_simple_lock_init(kernel_lock);
              kernel_lock_dodebug = LOCKDEBUG_ALLOC(kernel_lock, &_kernel_lock_ops,
                  RETURN_ADDRESS);
      }
      CTASSERT(CACHE_LINE_SIZE >= sizeof(__cpu_simple_lock_t));
      
      /*
       * Print debugging information about the kernel lock.
       */
      static void
      _kernel_lock_dump(const volatile void *junk, lockop_printer_t pr)
      {
              struct cpu_info *ci = curcpu();
      
              (void)junk;
      
              pr("curcpu holds : %18d wanted by: %#018lx\n",
                  ci->ci_biglock_count, (long)ci->ci_biglock_wanted);
      }
      
      /*
       * Acquire 'nlocks' holds on the kernel lock.
       */
      void
      _kernel_lock(int nlocks)
      {
              struct cpu_info *ci;
              LOCKSTAT_TIMER(spintime);
              LOCKSTAT_FLAG(lsflag);
              struct lwp *owant;
              u_int spins;
              int s;
              struct lwp *l = curlwp;
      
              _KERNEL_LOCK_ASSERT(nlocks > 0);
      
              s = splvm();
              ci = curcpu();
              if (ci->ci_biglock_count != 0) {
                      _KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));
                      ci->ci_biglock_count += nlocks;
                      l->l_blcnt += nlocks;
                      splx(s);
                      return;
              }
      
              _KERNEL_LOCK_ASSERT(l->l_blcnt == 0);
              LOCKDEBUG_WANTLOCK(kernel_lock_dodebug, kernel_lock, RETURN_ADDRESS,
                  0);
      
              if (__cpu_simple_lock_try(kernel_lock)) {
                      ci->ci_biglock_count = nlocks;
                      l->l_blcnt = nlocks;
                      LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
                          RETURN_ADDRESS, 0);
                      splx(s);
                      return;
              }
      
              /*
               * To remove the ordering constraint between adaptive mutexes
               * and kernel_lock we must make it appear as if this thread is
               * blocking.  For non-interlocked mutex release, a store fence
               * is required to ensure that the result of any mutex_exit()
               * by the current LWP becomes visible on the bus before the set
               * of ci->ci_biglock_wanted becomes visible.
               */
              membar_producer();
              owant = ci->ci_biglock_wanted;
              ci->ci_biglock_wanted = l;
      
              /*
               * Spin until we acquire the lock.  Once we have it, record the
               * time spent with lockstat.
               */
              LOCKSTAT_ENTER(lsflag);
              LOCKSTAT_START_TIMER(lsflag, spintime);
      
              spins = 0;
              do {
                      splx(s);
                      while (__SIMPLELOCK_LOCKED_P(kernel_lock)) {
                              if (SPINLOCK_SPINOUT(spins)) {
                                      extern int start_init_exec;
                                      if (!start_init_exec)
                                              _KERNEL_LOCK_ABORT("spinout");
                              }
                              SPINLOCK_BACKOFF_HOOK;
                              SPINLOCK_SPIN_HOOK;
                      }
                      s = splvm();
              } while (!__cpu_simple_lock_try(kernel_lock));
      
              ci->ci_biglock_count = nlocks;
              l->l_blcnt = nlocks;
              LOCKSTAT_STOP_TIMER(lsflag, spintime);
              LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
                  RETURN_ADDRESS, 0);
              if (owant == NULL) {
                      LOCKSTAT_EVENT_RA(lsflag, kernel_lock,
                          LB_KERNEL_LOCK | LB_SPIN, 1, spintime, RETURN_ADDRESS);
              }
              LOCKSTAT_EXIT(lsflag);
              splx(s);
      
              /*
               * Now that we have kernel_lock, reset ci_biglock_wanted.  This
               * store must be unbuffered (immediately visible on the bus) in
               * order for non-interlocked mutex release to work correctly.
               * It must be visible before a mutex_exit() can execute on this
               * processor.
               *
               * Note: only where CAS is available in hardware will this be
               * an unbuffered write, but non-interlocked release cannot be
               * done on CPUs without CAS in hardware.
               */
              (void)atomic_swap_ptr(&ci->ci_biglock_wanted, owant);
      
              /*
               * Issue a memory barrier as we have acquired a lock.  This also
               * prevents stores from a following mutex_exit() being reordered
               * to occur before our store to ci_biglock_wanted above.
               */
              membar_enter();
      }
      
      /*
       * Release 'nlocks' holds on the kernel lock.  If 'nlocks' is zero, release
       * all holds.
       */
      void
      _kernel_unlock(int nlocks, int *countp)
      {
              struct cpu_info *ci;
              u_int olocks;
              int s;
              struct lwp *l = curlwp;
      
              _KERNEL_LOCK_ASSERT(nlocks < 2);
      
              olocks = l->l_blcnt;
      
              if (olocks == 0) {
                      _KERNEL_LOCK_ASSERT(nlocks <= 0);
                      if (countp != NULL)
                              *countp = 0;
                      return;
              }
      
              _KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));
      
              if (nlocks == 0)
                      nlocks = olocks;
              else if (nlocks == -1) {
                      nlocks = 1;
                      _KERNEL_LOCK_ASSERT(olocks == 1);
              }
              s = splvm();
              ci = curcpu();
              _KERNEL_LOCK_ASSERT(ci->ci_biglock_count >= l->l_blcnt);
              if (ci->ci_biglock_count == nlocks) {
                      LOCKDEBUG_UNLOCKED(kernel_lock_dodebug, kernel_lock,
                          RETURN_ADDRESS, 0);
                      ci->ci_biglock_count = 0;
                      __cpu_simple_unlock(kernel_lock);
                      l->l_blcnt -= nlocks;
                      splx(s);
                      if (l->l_dopreempt)
                              kpreempt(0);
              } else {
                      ci->ci_biglock_count -= nlocks;
                      l->l_blcnt -= nlocks;
                      splx(s);
              }
      
              if (countp != NULL)
                      *countp = olocks;
      }
      
      bool
      _kernel_locked_p(void)
      {
              return __SIMPLELOCK_LOCKED_P(kernel_lock);
      }
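
/*
 * Usage sketch (illustrative only): callers normally take and drop the
 * big lock through the KERNEL_LOCK()/KERNEL_UNLOCK_ONE() wrappers, e.g.
 * around a call into a non-MPSAFE subsystem:
 *
 *	KERNEL_LOCK(1, NULL);
 *	... call non-MPSAFE code ...
 *	KERNEL_UNLOCK_ONE(NULL);
 */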
      /*        $NetBSD: sysv_shm.c,v 1.135 2019/06/10 00:35:47 chs Exp $        */
      
      /*-
       * Copyright (c) 1999, 2007 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
       * NASA Ames Research Center, and by Mindaugas Rasiukevicius.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Copyright (c) 1994 Adam Glass and Charles M. Hannum.  All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. All advertising materials mentioning features or use of this software
       *    must display the following acknowledgement:
       *        This product includes software developed by Adam Glass and Charles M.
       *        Hannum.
       * 4. The names of the authors may not be used to endorse or promote products
       *    derived from this software without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
       * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
       * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
       * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
       * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
       * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
       * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: sysv_shm.c,v 1.135 2019/06/10 00:35:47 chs Exp $");
      
      #ifdef _KERNEL_OPT
      #include "opt_sysv.h"
      #endif
      
      #include <sys/param.h>
      #include <sys/kernel.h>
      #include <sys/kmem.h>
      #include <sys/shm.h>
      #include <sys/mutex.h>
      #include <sys/mman.h>
      #include <sys/stat.h>
      #include <sys/sysctl.h>
      #include <sys/mount.h>                /* XXX for <sys/syscallargs.h> */
      #include <sys/syscallargs.h>
      #include <sys/queue.h>
      #include <sys/kauth.h>
      
      #include <uvm/uvm_extern.h>
      #include <uvm/uvm_object.h>
      
      struct shmmap_entry {
              SLIST_ENTRY(shmmap_entry) next;
              vaddr_t va;
              int shmid;
      };
      
      int                        shm_nused                __cacheline_aligned;
      struct shmid_ds *        shmsegs                        __read_mostly;
      
      static kmutex_t                shm_lock                __cacheline_aligned;
      static kcondvar_t *        shm_cv                        __cacheline_aligned;
      static int                shm_last_free                __cacheline_aligned;
      static size_t                shm_committed                __cacheline_aligned;
      static int                shm_use_phys                __read_mostly;
      
      static kcondvar_t        shm_realloc_cv;
      static bool                shm_realloc_state;
      static u_int                shm_realloc_disable;
      
      struct shmmap_state {
              unsigned int nitems;
              unsigned int nrefs;
              SLIST_HEAD(, shmmap_entry) entries;
      };
      
      extern int kern_has_sysvshm;
      
      SYSCTL_SETUP_PROTO(sysctl_ipc_shm_setup);
      
      #ifdef SHMDEBUG
      #define SHMPRINTF(a) printf a
      #else
      #define SHMPRINTF(a)
      #endif
      
      static int shmrealloc(int);
      
      /*
       * Find the shared memory segment by the identifier.
       *  => must be called with shm_lock held;
       */
      static struct shmid_ds *
      shm_find_segment_by_shmid(int shmid)
      {
              int segnum;
              struct shmid_ds *shmseg;
      
              KASSERT(mutex_owned(&shm_lock));
      
              segnum = IPCID_TO_IX(shmid);
              if (segnum < 0 || segnum >= shminfo.shmmni)
                      return NULL;
              shmseg = &shmsegs[segnum];
              if ((shmseg->shm_perm.mode & SHMSEG_ALLOCATED) == 0)
                      return NULL;
              if ((shmseg->shm_perm.mode &
                  (SHMSEG_REMOVED|SHMSEG_RMLINGER)) == SHMSEG_REMOVED)
                      return NULL;
              if (shmseg->shm_perm._seq != IPCID_TO_SEQ(shmid))
                      return NULL;
      
              return shmseg;
      }
      
      /*
       * Free memory segment.
       *  => must be called with shm_lock held;
       */
      static void
      shm_free_segment(int segnum)
      {
              struct shmid_ds *shmseg;
              size_t size;
              bool wanted;
      
              KASSERT(mutex_owned(&shm_lock));
      
              shmseg = &shmsegs[segnum];
              SHMPRINTF(("shm freeing key 0x%lx seq 0x%x\n",
                  shmseg->shm_perm._key, shmseg->shm_perm._seq));
      
              size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
              wanted = (shmseg->shm_perm.mode & SHMSEG_WANTED);
      
              shmseg->_shm_internal = NULL;
              shm_committed -= btoc(size);
              shm_nused--;
              shmseg->shm_perm.mode = SHMSEG_FREE;
              shm_last_free = segnum;
        if (wanted)
                      cv_broadcast(&shm_cv[segnum]);
      }
      
      /*
       * Delete entry from the shm map.
       *  => must be called with shm_lock held;
       */
      static struct uvm_object *
      shm_delete_mapping(struct shmmap_state *shmmap_s,
          struct shmmap_entry *shmmap_se)
      {
              struct uvm_object *uobj = NULL;
              struct shmid_ds *shmseg;
              int segnum;
      
              KASSERT(mutex_owned(&shm_lock));
      
              segnum = IPCID_TO_IX(shmmap_se->shmid);
              shmseg = &shmsegs[segnum];
              SLIST_REMOVE(&shmmap_s->entries, shmmap_se, shmmap_entry, next);
              shmmap_s->nitems--;
              shmseg->shm_dtime = time_second;
              if ((--shmseg->shm_nattch <= 0) &&
                  (shmseg->shm_perm.mode & SHMSEG_REMOVED)) {
                      uobj = shmseg->_shm_internal;
                      shm_free_segment(segnum);
              }
      
              return uobj;
      }
      
      /*
 * Get a non-shared shm map for this vmspace.  Note that memory
 * allocation may be performed with the lock held.
       */
      static struct shmmap_state *
      shmmap_getprivate(struct proc *p)
      {
              struct shmmap_state *oshmmap_s, *shmmap_s;
              struct shmmap_entry *oshmmap_se, *shmmap_se;
      
              KASSERT(mutex_owned(&shm_lock));
      
        /* 1. A shm map with refcnt == 1 is used only by us - return it */
              oshmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
              if (oshmmap_s && oshmmap_s->nrefs == 1)
                      return oshmmap_s;
      
        /* 2. No shm map present - create a fresh one */
              shmmap_s = kmem_zalloc(sizeof(struct shmmap_state), KM_SLEEP);
              shmmap_s->nrefs = 1;
              SLIST_INIT(&shmmap_s->entries);
              p->p_vmspace->vm_shm = (void *)shmmap_s;
      
              if (oshmmap_s == NULL)
                      return shmmap_s;
      
              SHMPRINTF(("shmmap_getprivate: vm %p split (%d entries), was used by %d\n",
                  p->p_vmspace, oshmmap_s->nitems, oshmmap_s->nrefs));
      
              /* 3. A shared shm map, copy to a fresh one and adjust refcounts */
              SLIST_FOREACH(oshmmap_se, &oshmmap_s->entries, next) {
                      shmmap_se = kmem_alloc(sizeof(struct shmmap_entry), KM_SLEEP);
                      shmmap_se->va = oshmmap_se->va;
                      shmmap_se->shmid = oshmmap_se->shmid;
                      SLIST_INSERT_HEAD(&shmmap_s->entries, shmmap_se, next);
              }
              shmmap_s->nitems = oshmmap_s->nitems;
              oshmmap_s->nrefs--;
      
              return shmmap_s;
      }
      
      /*
       * Lock/unlock the memory.
       *  => must be called with shm_lock held;
       */
      static int
      shm_memlock(struct shmid_ds *shmseg, int shmid, int cmd)
      {
              size_t size;
              int error;
      
              KASSERT(mutex_owned(&shm_lock));
      
              size = round_page(shmseg->shm_segsz);
      
              if (cmd == SHM_LOCK && (shmseg->shm_perm.mode & SHMSEG_WIRED) == 0) {
                      /* Wire the object and map, then tag it */
                      error = uvm_obj_wirepages(shmseg->_shm_internal,
                          0, size, NULL);
                      if (error)
                              return EIO;
                      shmseg->shm_perm.mode |= SHMSEG_WIRED;
      
              } else if (cmd == SHM_UNLOCK &&
                  (shmseg->shm_perm.mode & SHMSEG_WIRED) != 0) {
                      /* Unwire the object, then untag it */
                      uvm_obj_unwirepages(shmseg->_shm_internal, 0, size);
                      shmseg->shm_perm.mode &= ~SHMSEG_WIRED;
              }
      
              return 0;
      }
      
      /*
       * Unmap shared memory.
       */
      int
      sys_shmdt(struct lwp *l, const struct sys_shmdt_args *uap, register_t *retval)
      {
              /* {
                      syscallarg(const void *) shmaddr;
              } */
              struct proc *p = l->l_proc;
              struct shmmap_state *shmmap_s1, *shmmap_s;
              struct shmmap_entry *shmmap_se;
              struct uvm_object *uobj;
              struct shmid_ds *shmseg;
              size_t size;
      
              mutex_enter(&shm_lock);
              /* In case of reallocation, we will wait for completion */
              while (__predict_false(shm_realloc_state))
                      cv_wait(&shm_realloc_cv, &shm_lock);
      
              shmmap_s1 = (struct shmmap_state *)p->p_vmspace->vm_shm;
              if (shmmap_s1 == NULL) {
                      mutex_exit(&shm_lock);
                      return EINVAL;
              }
      
              /* Find the map entry */
              SLIST_FOREACH(shmmap_se, &shmmap_s1->entries, next)
                      if (shmmap_se->va == (vaddr_t)SCARG(uap, shmaddr))
                              break;
              if (shmmap_se == NULL) {
                      mutex_exit(&shm_lock);
                      return EINVAL;
              }
      
              shmmap_s = shmmap_getprivate(p);
              if (shmmap_s != shmmap_s1) {
                      /* Map has been copied, lookup entry in new map */
                      SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next)
                              if (shmmap_se->va == (vaddr_t)SCARG(uap, shmaddr))
                                      break;
                      if (shmmap_se == NULL) {
                              mutex_exit(&shm_lock);
                              return EINVAL;
                      }
              }
      
              SHMPRINTF(("shmdt: vm %p: remove %d @%lx\n",
                  p->p_vmspace, shmmap_se->shmid, shmmap_se->va));
      
              /* Delete the entry from shm map */
              uobj = shm_delete_mapping(shmmap_s, shmmap_se);
              shmseg = &shmsegs[IPCID_TO_IX(shmmap_se->shmid)];
              size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
              mutex_exit(&shm_lock);
      
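        /*
         * Unmap the region and, if the segment was freed, drop the final
         * object reference; both are done without holding shm_lock.
         */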
              uvm_deallocate(&p->p_vmspace->vm_map, shmmap_se->va, size);
              if (uobj != NULL) {
                      uao_detach(uobj);
              }
              kmem_free(shmmap_se, sizeof(struct shmmap_entry));
      
              return 0;
      }
      
      /*
       * Map shared memory.
       */
      int
      sys_shmat(struct lwp *l, const struct sys_shmat_args *uap, register_t *retval)
      {
              /* {
                      syscallarg(int) shmid;
                      syscallarg(const void *) shmaddr;
                      syscallarg(int) shmflg;
              } */
              int error, flags = 0;
              struct proc *p = l->l_proc;
              kauth_cred_t cred = l->l_cred;
              struct shmid_ds *shmseg;
              struct shmmap_state *shmmap_s;
              struct shmmap_entry *shmmap_se;
              struct uvm_object *uobj;
              struct vmspace *vm;
              vaddr_t attach_va;
              vm_prot_t prot;
              vsize_t size;
      
              /* Allocate a new map entry and set it */
              shmmap_se = kmem_alloc(sizeof(struct shmmap_entry), KM_SLEEP);
              shmmap_se->shmid = SCARG(uap, shmid);
      
              mutex_enter(&shm_lock);
              /* In case of reallocation, we will wait for completion */
              while (__predict_false(shm_realloc_state))
                      cv_wait(&shm_realloc_cv, &shm_lock);
      
              shmseg = shm_find_segment_by_shmid(SCARG(uap, shmid));
              if (shmseg == NULL) {
                      error = EINVAL;
                      goto err;
              }
              error = ipcperm(cred, &shmseg->shm_perm,
                  (SCARG(uap, shmflg) & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
              if (error)
                      goto err;
      
              vm = p->p_vmspace;
              shmmap_s = (struct shmmap_state *)vm->vm_shm;
              if (shmmap_s && shmmap_s->nitems >= shminfo.shmseg) {
                      error = EMFILE;
                      goto err;
              }
      
              size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
              prot = VM_PROT_READ;
              if ((SCARG(uap, shmflg) & SHM_RDONLY) == 0)
                      prot |= VM_PROT_WRITE;
              if (SCARG(uap, shmaddr)) {
                      flags |= UVM_FLAG_FIXED;
                      if (SCARG(uap, shmflg) & SHM_RND)
                              attach_va =
                                  (vaddr_t)SCARG(uap, shmaddr) & ~(SHMLBA-1);
                      else if (((vaddr_t)SCARG(uap, shmaddr) & (SHMLBA-1)) == 0)
                              attach_va = (vaddr_t)SCARG(uap, shmaddr);
                      else {
                              error = EINVAL;
                              goto err;
                      }
              } else {
                      /* This is just a hint to uvm_map() about where to put it. */
                      attach_va = p->p_emul->e_vm_default_addr(p,
                          (vaddr_t)vm->vm_daddr, size,
                          p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
              }
      
        /*
         * Create a map entry, add it to the list and increase the counters.
         * Since the lock will be dropped before the mapping, disable
         * reallocation.
         */
              shmmap_s = shmmap_getprivate(p);
              SLIST_INSERT_HEAD(&shmmap_s->entries, shmmap_se, next);
              shmmap_s->nitems++;
              shmseg->shm_lpid = p->p_pid;
              shmseg->shm_nattch++;
              shm_realloc_disable++;
              mutex_exit(&shm_lock);
      
              /*
               * Add a reference to the memory object, map it to the
               * address space, and lock the memory, if needed.
               */
              uobj = shmseg->_shm_internal;
              uao_reference(uobj);
              error = uvm_map(&vm->vm_map, &attach_va, size, uobj, 0, 0,
                  UVM_MAPFLAG(prot, prot, UVM_INH_SHARE, UVM_ADV_RANDOM, flags));
              if (error)
                      goto err_detach;
      
              /* Set the new address, and update the time */
              mutex_enter(&shm_lock);
              shmmap_se->va = attach_va;
              shmseg->shm_atime = time_second;
              shm_realloc_disable--;
              retval[0] = attach_va;
              SHMPRINTF(("shmat: vm %p: add %d @%lx\n",
                  p->p_vmspace, shmmap_se->shmid, attach_va));
      err:
              cv_broadcast(&shm_realloc_cv);
              mutex_exit(&shm_lock);
              if (error && shmmap_se) {
                      kmem_free(shmmap_se, sizeof(struct shmmap_entry));
              }
              return error;
      
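/* uvm_map() failed: undo the attach accounting and drop the references. */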
      err_detach:
              uao_detach(uobj);
              mutex_enter(&shm_lock);
              uobj = shm_delete_mapping(shmmap_s, shmmap_se);
              shm_realloc_disable--;
              cv_broadcast(&shm_realloc_cv);
              mutex_exit(&shm_lock);
              if (uobj != NULL) {
                      uao_detach(uobj);
              }
              kmem_free(shmmap_se, sizeof(struct shmmap_entry));
              return error;
      }
      
      /*
       * Shared memory control operations.
       */
      int
      sys___shmctl50(struct lwp *l, const struct sys___shmctl50_args *uap,
          register_t *retval)
      {
              /* {
                      syscallarg(int) shmid;
                      syscallarg(int) cmd;
                      syscallarg(struct shmid_ds *) buf;
              } */
              struct shmid_ds shmbuf;
              int cmd, error;
      
              cmd = SCARG(uap, cmd);
              if (cmd == IPC_SET) {
                      error = copyin(SCARG(uap, buf), &shmbuf, sizeof(shmbuf));
                      if (error)
                              return error;
              }
      
              error = shmctl1(l, SCARG(uap, shmid), cmd,
                  (cmd == IPC_SET || cmd == IPC_STAT) ? &shmbuf : NULL);
      
              if (error == 0 && cmd == IPC_STAT)
                      error = copyout(&shmbuf, SCARG(uap, buf), sizeof(shmbuf));
      
              return error;
      }
      
      int
      shmctl1(struct lwp *l, int shmid, int cmd, struct shmid_ds *shmbuf)
      {
              struct uvm_object *uobj = NULL;
              kauth_cred_t cred = l->l_cred;
              struct shmid_ds *shmseg;
              int error = 0;
      
              mutex_enter(&shm_lock);
              /* In case of reallocation, we will wait for completion */
              while (__predict_false(shm_realloc_state))
                      cv_wait(&shm_realloc_cv, &shm_lock);
      
              shmseg = shm_find_segment_by_shmid(shmid);
              if (shmseg == NULL) {
                      mutex_exit(&shm_lock);
                      return EINVAL;
              }
      
              switch (cmd) {
              case IPC_STAT:
                      if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_R)) != 0)
                              break;
                      memset(shmbuf, 0, sizeof *shmbuf);
                      shmbuf->shm_perm = shmseg->shm_perm;
                      shmbuf->shm_perm.mode &= 0777;
                      shmbuf->shm_segsz = shmseg->shm_segsz;
                      shmbuf->shm_lpid = shmseg->shm_lpid;
                      shmbuf->shm_cpid = shmseg->shm_cpid;
                      shmbuf->shm_nattch = shmseg->shm_nattch;
                      shmbuf->shm_atime = shmseg->shm_atime;
                      shmbuf->shm_dtime = shmseg->shm_dtime;
                      shmbuf->shm_ctime = shmseg->shm_ctime;
                      break;
              case IPC_SET:
                      if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_M)) != 0)
                              break;
                      shmseg->shm_perm.uid = shmbuf->shm_perm.uid;
                      shmseg->shm_perm.gid = shmbuf->shm_perm.gid;
                      shmseg->shm_perm.mode =
                          (shmseg->shm_perm.mode & ~ACCESSPERMS) |
                          (shmbuf->shm_perm.mode & ACCESSPERMS);
                      shmseg->shm_ctime = time_second;
                      break;
              case IPC_RMID:
                      if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_M)) != 0)
                              break;
                      shmseg->shm_perm._key = IPC_PRIVATE;
                      shmseg->shm_perm.mode |= SHMSEG_REMOVED;
                      if (shmseg->shm_nattch <= 0) {
                              uobj = shmseg->_shm_internal;
                              shm_free_segment(IPCID_TO_IX(shmid));
                      }
                      break;
              case SHM_LOCK:
              case SHM_UNLOCK:
                      if ((error = kauth_authorize_system(cred,
                          KAUTH_SYSTEM_SYSVIPC,
                          (cmd == SHM_LOCK) ? KAUTH_REQ_SYSTEM_SYSVIPC_SHM_LOCK :
                          KAUTH_REQ_SYSTEM_SYSVIPC_SHM_UNLOCK, NULL, NULL, NULL)) != 0)
                              break;
                      error = shm_memlock(shmseg, shmid, cmd);
                      break;
              default:
                      error = EINVAL;
              }
      
              mutex_exit(&shm_lock);
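        /* If IPC_RMID freed the segment above, drop its final object reference. */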
              if (uobj != NULL)
                      uao_detach(uobj);
              return error;
      }
      
      /*
       * Try to take an already existing segment.
       *  => must be called with shm_lock held;
       *  => called from one place, thus, inline;
       */
      static inline int
      shmget_existing(struct lwp *l, const struct sys_shmget_args *uap, int mode,
          register_t *retval)
      {
              struct shmid_ds *shmseg;
              kauth_cred_t cred = l->l_cred;
              int segnum, error;
      again:
              KASSERT(mutex_owned(&shm_lock));
      
              /* Find segment by key */
              for (segnum = 0; segnum < shminfo.shmmni; segnum++)
                      if ((shmsegs[segnum].shm_perm.mode & SHMSEG_ALLOCATED) &&
                          shmsegs[segnum].shm_perm._key == SCARG(uap, key))
                              break;
              if (segnum == shminfo.shmmni) {
                      /* Not found */
                      return -1;
              }
      
              shmseg = &shmsegs[segnum];
              if (shmseg->shm_perm.mode & SHMSEG_REMOVED) {
                      /*
                       * This segment is in the process of being allocated.  Wait
                       * until it's done, and look the key up again (in case the
                       * allocation failed or it was freed).
                       */
                      shmseg->shm_perm.mode |= SHMSEG_WANTED;
                      error = cv_wait_sig(&shm_cv[segnum], &shm_lock);
                      if (error)
                              return error;
                      goto again;
              }
      
              /*
               * First check the flags, to generate a useful error when a
               * segment already exists.
               */
              if ((SCARG(uap, shmflg) & (IPC_CREAT | IPC_EXCL)) ==
                  (IPC_CREAT | IPC_EXCL))
                      return EEXIST;
      
              /* Check the permission and segment size. */
              error = ipcperm(cred, &shmseg->shm_perm, mode);
              if (error)
                      return error;
              if (SCARG(uap, size) && SCARG(uap, size) > shmseg->shm_segsz)
                      return EINVAL;
      
              *retval = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);
              return 0;
      }
      
      int
      sys_shmget(struct lwp *l, const struct sys_shmget_args *uap, register_t *retval)
      {
              /* {
                      syscallarg(key_t) key;
                      syscallarg(size_t) size;
                      syscallarg(int) shmflg;
              } */
              struct shmid_ds *shmseg;
        kauth_cred_t cred = l->l_cred;
              key_t key = SCARG(uap, key);
              size_t size;
              int error, mode, segnum;
              bool lockmem;
      
              mode = SCARG(uap, shmflg) & ACCESSPERMS;
              if (SCARG(uap, shmflg) & _SHM_RMLINGER)
                      mode |= SHMSEG_RMLINGER;
      
              SHMPRINTF(("shmget: key 0x%lx size 0x%zx shmflg 0x%x mode 0x%x\n",
                  SCARG(uap, key), SCARG(uap, size), SCARG(uap, shmflg), mode));
      
        mutex_enter(&shm_lock);
              /* In case of reallocation, we will wait for completion */
              while (__predict_false(shm_realloc_state))
                      cv_wait(&shm_realloc_cv, &shm_lock);
      
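        /*
         * For a named key, try to find an existing segment first;
         * shmget_existing() returns -1 if the key is not in use.
         */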
        if (key != IPC_PRIVATE) {
                      error = shmget_existing(l, uap, mode, retval);
                      if (error != -1) {
                              mutex_exit(&shm_lock);
                              return error;
                      }
                      if ((SCARG(uap, shmflg) & IPC_CREAT) == 0) {
                              mutex_exit(&shm_lock);
                              return ENOENT;
                      }
              }
              error = 0;
      
              /*
         * Check the limits.
               */
        size = SCARG(uap, size);
        if (size < shminfo.shmmin || size > shminfo.shmmax) {
                      mutex_exit(&shm_lock);
                return EINVAL;
              }
        if (shm_nused >= shminfo.shmmni) {
                      mutex_exit(&shm_lock);
                      return ENOSPC;
              }
        size = round_page(size);
              if (shm_committed + btoc(size) > shminfo.shmall) {
                      mutex_exit(&shm_lock);
                      return ENOMEM;
              }
      
              /* Find the first available segment */
        if (shm_last_free < 0) {
                for (segnum = 0; segnum < shminfo.shmmni; segnum++)
                        if (shmsegs[segnum].shm_perm.mode & SHMSEG_FREE)
                                      break;
                      KASSERT(segnum < shminfo.shmmni);
              } else {
                      segnum = shm_last_free;
                      shm_last_free = -1;
              }
      
        /*
         * Initialize the segment.
         * We will drop the lock while allocating the memory, so mark the
         * segment as allocated but removed, so that no other thread can
         * take it.  Also, disable reallocation while the lock is dropped.
         */
              shmseg = &shmsegs[segnum];
        shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED;
              shm_committed += btoc(size);
              shm_nused++;
              lockmem = shm_use_phys;
              shm_realloc_disable++;
              mutex_exit(&shm_lock);
      
              /* Allocate the memory object and lock it if needed */
              shmseg->_shm_internal = uao_create(size, 0);
              if (lockmem) {
                      /* Wire the pages and tag it */
                      error = uvm_obj_wirepages(shmseg->_shm_internal, 0, size, NULL);
                      if (error) {
                              uao_detach(shmseg->_shm_internal);
                              mutex_enter(&shm_lock);
                              shm_free_segment(segnum);
                              shm_realloc_disable--;
                              mutex_exit(&shm_lock);
                              return error;
                      }
              }
      
        /*
         * Note: while the segment is marked, there is no need to hold the
         * lock while initializing it (except for shm_perm.mode).
         */
        shmseg->shm_perm._key = SCARG(uap, key);
              shmseg->shm_perm._seq = (shmseg->shm_perm._seq + 1) & 0x7fff;
              *retval = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);
      
              shmseg->shm_perm.cuid = shmseg->shm_perm.uid = kauth_cred_geteuid(cred);
              shmseg->shm_perm.cgid = shmseg->shm_perm.gid = kauth_cred_getegid(cred);
              shmseg->shm_segsz = SCARG(uap, size);
              shmseg->shm_cpid = l->l_proc->p_pid;
              shmseg->shm_lpid = shmseg->shm_nattch = 0;
              shmseg->shm_atime = shmseg->shm_dtime = 0;
              shmseg->shm_ctime = time_second;
      
        /*
         * The segment is initialized.
         * Take the lock, mark it as allocated, and notify waiters (if any).
         * Also, re-enable reallocation.
         */
              mutex_enter(&shm_lock);
              shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) |
                  (mode & (ACCESSPERMS | SHMSEG_RMLINGER)) |
            SHMSEG_ALLOCATED | (lockmem ? SHMSEG_WIRED : 0);
              if (shmseg->shm_perm.mode & SHMSEG_WANTED) {
                      shmseg->shm_perm.mode &= ~SHMSEG_WANTED;
                      cv_broadcast(&shm_cv[segnum]);
              }
        shm_realloc_disable--;
              cv_broadcast(&shm_realloc_cv);
              mutex_exit(&shm_lock);
      
              return error;
      }
      
      void
      shmfork(struct vmspace *vm1, struct vmspace *vm2)
      {
              struct shmmap_state *shmmap_s;
              struct shmmap_entry *shmmap_se;
      
              SHMPRINTF(("shmfork %p->%p\n", vm1, vm2));
              mutex_enter(&shm_lock);
              vm2->vm_shm = vm1->vm_shm;
              if (vm1->vm_shm) {
                      shmmap_s = (struct shmmap_state *)vm1->vm_shm;
                      SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next)
                              shmsegs[IPCID_TO_IX(shmmap_se->shmid)].shm_nattch++;
                      shmmap_s->nrefs++;
              }
              mutex_exit(&shm_lock);
      }
      
      void
      shmexit(struct vmspace *vm)
      {
              struct shmmap_state *shmmap_s;
              struct shmmap_entry *shmmap_se;
      
              mutex_enter(&shm_lock);
              shmmap_s = (struct shmmap_state *)vm->vm_shm;
              if (shmmap_s == NULL) {
                      mutex_exit(&shm_lock);
                      return;
              }
              vm->vm_shm = NULL;
      
              if (--shmmap_s->nrefs > 0) {
                      SHMPRINTF(("shmexit: vm %p drop ref (%d entries), refs = %d\n",
                          vm, shmmap_s->nitems, shmmap_s->nrefs));
                      SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next) {
                              shmsegs[IPCID_TO_IX(shmmap_se->shmid)].shm_nattch--;
                      }
                      mutex_exit(&shm_lock);
                      return;
              }
      
              SHMPRINTF(("shmexit: vm %p cleanup (%d entries)\n", vm, shmmap_s->nitems));
              if (shmmap_s->nitems == 0) {
                      mutex_exit(&shm_lock);
                      kmem_free(shmmap_s, sizeof(struct shmmap_state));
                      return;
              }
      
              /*
         * Delete each entry from the shm map, dropping the lock to unmap it.
               */
              for (;;) {
                      struct shmid_ds *shmseg;
                      struct uvm_object *uobj;
                      size_t sz;
      
                      shmmap_se = SLIST_FIRST(&shmmap_s->entries);
                      KASSERT(shmmap_se != NULL);
      
                      shmseg = &shmsegs[IPCID_TO_IX(shmmap_se->shmid)];
                      sz = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
                      /* shm_delete_mapping() removes from the list. */
                      uobj = shm_delete_mapping(shmmap_s, shmmap_se);
                      mutex_exit(&shm_lock);
      
                      uvm_deallocate(&vm->vm_map, shmmap_se->va, sz);
                      if (uobj != NULL) {
                              uao_detach(uobj);
                      }
                      kmem_free(shmmap_se, sizeof(struct shmmap_entry));
      
                      if (SLIST_EMPTY(&shmmap_s->entries)) {
                              break;
                      }
                      mutex_enter(&shm_lock);
                      KASSERT(!SLIST_EMPTY(&shmmap_s->entries));
              }
              kmem_free(shmmap_s, sizeof(struct shmmap_state));
      }
      
      static int
      shmrealloc(int newshmni)
      {
              vaddr_t v;
              struct shmid_ds *oldshmsegs, *newshmsegs;
              kcondvar_t *newshm_cv, *oldshm_cv;
              size_t sz;
              int i, lsegid, oldshmni;
      
              if (newshmni < 1)
                      return EINVAL;
      
              /* Allocate new memory area */
              sz = ALIGN(newshmni * sizeof(struct shmid_ds)) +
                  ALIGN(newshmni * sizeof(kcondvar_t));
              sz = round_page(sz);
              v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
              if (v == 0)
                      return ENOMEM;
      
              mutex_enter(&shm_lock);
              while (shm_realloc_state || shm_realloc_disable)
                      cv_wait(&shm_realloc_cv, &shm_lock);
      
        /*
         * Find the index of the highest segment in use.  Fail if it
         * would not fit into the smaller new array.
         */
              lsegid = 0;
              for (i = 0; i < shminfo.shmmni; i++)
                      if ((shmsegs[i].shm_perm.mode & SHMSEG_FREE) == 0)
                              lsegid = i;
              if (lsegid >= newshmni) {
                      mutex_exit(&shm_lock);
                      uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED);
                      return EBUSY;
              }
              shm_realloc_state = true;
      
              newshmsegs = (void *)v;
              newshm_cv = (void *)((uintptr_t)newshmsegs +
                  ALIGN(newshmni * sizeof(struct shmid_ds)));
      
        /* Copy the existing segment structures to the new area */
              for (i = 0; i < shm_nused; i++) {
                      cv_init(&newshm_cv[i], "shmwait");
                      (void)memcpy(&newshmsegs[i], &shmsegs[i],
                          sizeof(newshmsegs[0]));
              }
      
        /* Mark all new segments as free, if there are any */
              for (; i < newshmni; i++) {
                      cv_init(&newshm_cv[i], "shmwait");
                      newshmsegs[i].shm_perm.mode = SHMSEG_FREE;
                      newshmsegs[i].shm_perm._seq = 0;
              }
      
              oldshmsegs = shmsegs;
              oldshmni = shminfo.shmmni;
              shminfo.shmmni = newshmni;
              shmsegs = newshmsegs;
              shm_cv = newshm_cv;
      
              /* Reallocation completed - notify all waiters, if any */
              shm_realloc_state = false;
              cv_broadcast(&shm_realloc_cv);
              mutex_exit(&shm_lock);
      
              /* Release now unused resources. */
              oldshm_cv = (void *)((uintptr_t)oldshmsegs +
                  ALIGN(oldshmni * sizeof(struct shmid_ds)));
              for (i = 0; i < oldshmni; i++)
                      cv_destroy(&oldshm_cv[i]);
      
              sz = ALIGN(oldshmni * sizeof(struct shmid_ds)) +
                  ALIGN(oldshmni * sizeof(kcondvar_t));
              sz = round_page(sz);
              uvm_km_free(kernel_map, (vaddr_t)oldshmsegs, sz, UVM_KMF_WIRED);
      
              return 0;
      }
      
      int
      shminit(struct sysctllog **clog)
      {
              vaddr_t v;
              size_t sz;
              int i;
      
              mutex_init(&shm_lock, MUTEX_DEFAULT, IPL_NONE);
              cv_init(&shm_realloc_cv, "shmrealc");
      
              /* Allocate the wired memory for our structures */
              sz = ALIGN(shminfo.shmmni * sizeof(struct shmid_ds)) +
                  ALIGN(shminfo.shmmni * sizeof(kcondvar_t));
              sz = round_page(sz);
              v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
              if (v == 0) {
                      printf("sysv_shm: cannot allocate memory");
                      return ENOMEM;
              }
              shmsegs = (void *)v;
              shm_cv = (void *)((uintptr_t)shmsegs +
                  ALIGN(shminfo.shmmni * sizeof(struct shmid_ds)));
      
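        /*
         * shminfo.shmmax is given in pages at configuration time; convert
         * it to bytes here, defaulting to a quarter of physical memory
         * (but at least 1024 pages).
         */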
              if (shminfo.shmmax == 0)
                      shminfo.shmmax = uimax(physmem / 4, 1024) * PAGE_SIZE;
              else
                      shminfo.shmmax *= PAGE_SIZE;
              shminfo.shmall = shminfo.shmmax / PAGE_SIZE;
      
              for (i = 0; i < shminfo.shmmni; i++) {
                      cv_init(&shm_cv[i], "shmwait");
                      shmsegs[i].shm_perm.mode = SHMSEG_FREE;
                      shmsegs[i].shm_perm._seq = 0;
              }
              shm_last_free = 0;
              shm_nused = 0;
              shm_committed = 0;
              shm_realloc_disable = 0;
              shm_realloc_state = false;
      
              kern_has_sysvshm = 1;
      
              /* Load the callback function pointers for the uvm subsystem */
              uvm_shmexit = shmexit;
              uvm_shmfork = shmfork;
      
      #ifdef _MODULE
              if (clog)
                      sysctl_ipc_shm_setup(clog);
      #endif
              return 0;
      }
      
      int
      shmfini(void)
      {
              size_t sz;
              int i;
              vaddr_t v = (vaddr_t)shmsegs;
      
              mutex_enter(&shm_lock);
              if (shm_nused) {
                      mutex_exit(&shm_lock);
                      return 1;
              }
      
              /* Clear the callback function pointers for the uvm subsystem */
              uvm_shmexit = NULL;
              uvm_shmfork = NULL;
      
              /* Destroy all condvars */
              for (i = 0; i < shminfo.shmmni; i++)
                      cv_destroy(&shm_cv[i]);
              cv_destroy(&shm_realloc_cv);
      
              /* Free the allocated/wired memory */
              sz = ALIGN(shminfo.shmmni * sizeof(struct shmid_ds)) +
                  ALIGN(shminfo.shmmni * sizeof(kcondvar_t));
              sz = round_page(sz);
              uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED);
      
              /* Release and destroy our mutex */
              mutex_exit(&shm_lock);
              mutex_destroy(&shm_lock);
      
              kern_has_sysvshm = 0;
      
              return 0;
      }
      
      static int
      sysctl_ipc_shmmni(SYSCTLFN_ARGS)
      {
              int newsize, error;
              struct sysctlnode node;
              node = *rnode;
              node.sysctl_data = &newsize;
      
              newsize = shminfo.shmmni;
              error = sysctl_lookup(SYSCTLFN_CALL(&node));
              if (error || newp == NULL)
                      return error;
      
              sysctl_unlock();
              error = shmrealloc(newsize);
              sysctl_relock();
              return error;
      }
      
      static int
      sysctl_ipc_shmmaxpgs(SYSCTLFN_ARGS)
      {
              uint32_t newsize;
              int error;
              struct sysctlnode node;
              node = *rnode;
              node.sysctl_data = &newsize;
      
              newsize = shminfo.shmall;
              error = sysctl_lookup(SYSCTLFN_CALL(&node));
              if (error || newp == NULL)
                      return error;
      
              if (newsize < 1)
                      return EINVAL;
      
              shminfo.shmall = newsize;
              shminfo.shmmax = (uint64_t)shminfo.shmall * PAGE_SIZE;
      
              return 0;
      }
      
      static int
      sysctl_ipc_shmmax(SYSCTLFN_ARGS)
      {
              uint64_t newsize;
              int error;
              struct sysctlnode node;
              node = *rnode;
              node.sysctl_data = &newsize;
      
              newsize = shminfo.shmmax;
              error = sysctl_lookup(SYSCTLFN_CALL(&node));
              if (error || newp == NULL)
                      return error;
      
              if (newsize < PAGE_SIZE)
                      return EINVAL;
      
              shminfo.shmmax = round_page(newsize);
              shminfo.shmall = shminfo.shmmax >> PAGE_SHIFT;
      
              return 0;
      }
      
      SYSCTL_SETUP(sysctl_ipc_shm_setup, "sysctl kern.ipc subtree setup")
      {
      
              sysctl_createv(clog, 0, NULL, NULL,
                      CTLFLAG_PERMANENT,
                      CTLTYPE_NODE, "ipc",
                      SYSCTL_DESCR("SysV IPC options"),
                      NULL, 0, NULL, 0,
                      CTL_KERN, KERN_SYSVIPC, CTL_EOL);
              sysctl_createv(clog, 0, NULL, NULL,
                      CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
                      CTLTYPE_QUAD, "shmmax",
                      SYSCTL_DESCR("Max shared memory segment size in bytes"),
                      sysctl_ipc_shmmax, 0, &shminfo.shmmax, 0,
                      CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMAX, CTL_EOL);
              sysctl_createv(clog, 0, NULL, NULL,
                      CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
                      CTLTYPE_INT, "shmmni",
                      SYSCTL_DESCR("Max number of shared memory identifiers"),
                      sysctl_ipc_shmmni, 0, &shminfo.shmmni, 0,
                      CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMNI, CTL_EOL);
              sysctl_createv(clog, 0, NULL, NULL,
                      CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
                      CTLTYPE_INT, "shmseg",
                      SYSCTL_DESCR("Max shared memory segments per process"),
                      NULL, 0, &shminfo.shmseg, 0,
                      CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMSEG, CTL_EOL);
              sysctl_createv(clog, 0, NULL, NULL,
                      CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
                      CTLTYPE_INT, "shmmaxpgs",
                      SYSCTL_DESCR("Max amount of shared memory in pages"),
                      sysctl_ipc_shmmaxpgs, 0, &shminfo.shmall, 0,
                      CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMAXPGS, CTL_EOL);
              sysctl_createv(clog, 0, NULL, NULL,
                      CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
                      CTLTYPE_INT, "shm_use_phys",
                      SYSCTL_DESCR("Enable/disable locking of shared memory in "
                          "physical memory"), NULL, 0, &shm_use_phys, 0,
                      CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMUSEPHYS, CTL_EOL);
      }
      /*        $NetBSD: syscallvar.h,v 1.12 2018/04/19 21:19:07 christos Exp $        */
      
      /*-
       * Copyright (c) 2008 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software developed for The NetBSD Foundation
       * by Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      #ifndef _SYS_SYSCALLVAR_H_
      #define        _SYS_SYSCALLVAR_H_
      
      #ifndef _KERNEL
      #error nothing of interest to userspace here
      #endif
      
      #if defined(_KERNEL) && defined(_KERNEL_OPT)
      #include "opt_dtrace.h"
      #endif
      
      #include <sys/systm.h>
      #include <sys/proc.h>
      
      extern struct emul emul_netbsd;
      
      struct syscall_package {
              u_short                sp_code;
              u_short                sp_flags;
              sy_call_t        *sp_call;
      };
      
      void        syscall_init(void);
      int        syscall_establish(const struct emul *, const struct syscall_package *);
      int        syscall_disestablish(const struct emul *, const struct syscall_package *);
      
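/*
 * Invoke a system call handler, recording the sysent entry on the LWP
 * for the duration of the call.
 */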
      static __inline int
      sy_call(const struct sysent *sy, struct lwp *l, const void *uap,
              register_t *rval)
      {
              int error;
      
        l->l_sysent = sy;
              error = (*sy->sy_call)(l, uap, rval);
        l->l_sysent = NULL;
      
              return error;
      }
      
      static __inline int
      sy_invoke(const struct sysent *sy, struct lwp *l, const void *uap,
              register_t *rval, int code)
      {
              const bool do_trace = l->l_proc->p_trace_enabled &&
                  (sy->sy_flags & SYCALL_INDIRECT) == 0;
              int error;
      
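/*
 * Evaluate the syscall's DTrace probe fields only when the hooks are
 * compiled in; otherwise they collapse to 0.
 */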
      #ifdef KDTRACE_HOOKS
      #define KDTRACE_ENTRY(a)        (a)
      #else
      #define KDTRACE_ENTRY(a)        (0)
      #endif
        if (__predict_true(!(do_trace || KDTRACE_ENTRY(sy->sy_entry)))
                  || (error = trace_enter(code, sy, uap)) == 0) {
                rval[0] = 0;
      #if !defined(__mips__) && !defined(__m68k__)
                      /*
                       * Due to the mips userland code for SYS_break needing v1 to be
                       * preserved, we can't clear this on mips. 
                       */
                      rval[1] = 0;
      #endif
                      error = sy_call(sy, l, uap, rval);
              }
      
        if (__predict_false(do_trace || KDTRACE_ENTRY(sy->sy_return))) {
                      trace_exit(code, sy, uap, rval, error);
              }
              return error;
      }
      
      /* inclusion in the kernel currently depends on SYSCALL_DEBUG */
      extern const char * const syscallnames[];
      extern const char * const altsyscallnames[];
      
      #endif        /* _SYS_SYSCALLVAR_H_ */
      /*        $NetBSD: sys_sched.c,v 1.46 2016/07/30 15:38:17 christos Exp $        */
      
      /*
       * Copyright (c) 2008, 2011 Mindaugas Rasiukevicius <rmind at NetBSD org>
       * All rights reserved.
       * 
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      /*
       * System calls relating to the scheduler.
       *
       * Lock order:
       *
       *        cpu_lock ->
       *            proc_lock ->
       *                proc_t::p_lock ->
       *                    lwp_t::lwp_lock
       *
       * TODO:
       *  - Handle pthread_setschedprio() as defined by POSIX;
       *  - Handle sched_yield() case for SCHED_FIFO as defined by POSIX;
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.46 2016/07/30 15:38:17 christos Exp $");
      
      #include <sys/param.h>
      
      #include <sys/cpu.h>
      #include <sys/kauth.h>
      #include <sys/kmem.h>
      #include <sys/lwp.h>
      #include <sys/mutex.h>
      #include <sys/proc.h>
      #include <sys/pset.h>
      #include <sys/sched.h>
      #include <sys/syscallargs.h>
      #include <sys/sysctl.h>
      #include <sys/systm.h>
      #include <sys/types.h>
      #include <sys/unistd.h>
      
      static struct sysctllog *sched_sysctl_log;
      static kauth_listener_t sched_listener;
      
      /*
 * Convert a user priority to an in-kernel priority, or convert the
 * current priority to the appropriate range for the new policy.
       */
      static pri_t
      convert_pri(lwp_t *l, int policy, pri_t pri)
      {
      
        /* Convert the user priority to an in-kernel priority */
              if (pri != PRI_NONE) {
                      /* Only for real-time threads */
                      KASSERT(pri >= SCHED_PRI_MIN && pri <= SCHED_PRI_MAX);
                      KASSERT(policy != SCHED_OTHER);
                      return PRI_USER_RT + pri;
              }
      
              /* Neither policy, nor priority change */
              if (l->l_class == policy)
                      return l->l_priority;
      
              /* Time-sharing -> real-time */
              if (l->l_class == SCHED_OTHER) {
                      KASSERT(policy == SCHED_FIFO || policy == SCHED_RR);
                      return PRI_USER_RT;
              }
      
              /* Real-time -> time-sharing */
              if (policy == SCHED_OTHER) {
                      KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR);
                      /*
                 * This is a bit arbitrary because the priority is dynamic
                       * for SCHED_OTHER threads and will likely be changed by
                       * the scheduler soon anyway.
                       */
                      return l->l_priority - PRI_USER_RT;
              }
      
              /* Real-time -> real-time */
              return l->l_priority;
      }
      
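/*
 * do_sched_setparam:
 *
 * Set the scheduling policy and/or priority for the LWPs of a process,
 * or for a single LWP if a non-zero lid is given.
 */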
      int
      do_sched_setparam(pid_t pid, lwpid_t lid, int policy,
          const struct sched_param *params)
      {
              struct proc *p;
              struct lwp *t;
              pri_t pri;
              u_int lcnt;
              int error;
      
              error = 0;
      
              pri = params->sched_priority;
      
              /* If no parameters specified, just return (this should not happen) */
              if (pri == PRI_NONE && policy == SCHED_NONE)
                      return 0;
      
              /* Validate scheduling class */
              if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR))
                      return EINVAL;
      
              /* Validate priority */
              if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX))
                      return EINVAL;
      
              if (pid != 0) {
                      /* Find the process */
                      mutex_enter(proc_lock);
                      p = proc_find(pid);
                      if (p == NULL) {
                              mutex_exit(proc_lock);
                              return ESRCH;
                      }
                      mutex_enter(p->p_lock);
                      mutex_exit(proc_lock);
                      /* Disallow modification of system processes */
                      if ((p->p_flag & PK_SYSTEM) != 0) {
                              mutex_exit(p->p_lock);
                              return EPERM;
                      }
              } else {
                      /* Use the calling process */
                      p = curlwp->l_proc;
                      mutex_enter(p->p_lock);
              }
      
              /* Find the LWP(s) */
              lcnt = 0;
              LIST_FOREACH(t, &p->p_lwps, l_sibling) {
                      pri_t kpri;
                      int lpolicy;
      
                      if (lid && lid != t->l_lid)
                              continue;
      
                      lcnt++;
                      lwp_lock(t);
                      lpolicy = (policy == SCHED_NONE) ? t->l_class : policy;
      
                      /* Disallow setting of priority for SCHED_OTHER threads */
                      if (lpolicy == SCHED_OTHER && pri != PRI_NONE) {
                              lwp_unlock(t);
                              error = EINVAL;
                              break;
                      }
      
                      /* Convert priority, if needed */
                      kpri = convert_pri(t, lpolicy, pri);
      
                      /* Check the permission */
                      error = kauth_authorize_process(kauth_cred_get(),
                          KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy),
                          KAUTH_ARG(kpri));
                      if (error) {
                              lwp_unlock(t);
                              break;
                      }
      
                      /* Set the scheduling class, change the priority */
                      t->l_class = lpolicy;
                      lwp_changepri(t, kpri);
                      lwp_unlock(t);
              }
              mutex_exit(p->p_lock);
              return (lcnt == 0) ? ESRCH : error;
      }
      
      /*
       * Set scheduling parameters.
       */
      int
      sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap,
          register_t *retval)
      {
              /* {
                      syscallarg(pid_t) pid;
                      syscallarg(lwpid_t) lid;
                      syscallarg(int) policy;
                      syscallarg(const struct sched_param *) params;
              } */
              struct sched_param params;
              int error;
      
              /* Get the parameters from the user-space */
              error = copyin(SCARG(uap, params), &params, sizeof(params));
              if (error)
                      goto out;
      
              error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid),
                  SCARG(uap, policy), &params);
      out:
              return error;
      }
      
      /*
       * do_sched_getparam:
       *
 * If lid is 0, return the parameters of the first LWP in the process.
       */
      int
      do_sched_getparam(pid_t pid, lwpid_t lid, int *policy,
          struct sched_param *params)
      {
              struct sched_param lparams;
              struct lwp *t;
              int error, lpolicy;
      
              t = lwp_find2(pid, lid); /* acquire p_lock */
              if (t == NULL)
                      return ESRCH;
      
              /* Check the permission */
              error = kauth_authorize_process(kauth_cred_get(),
                  KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL);
              if (error != 0) {
                      mutex_exit(t->l_proc->p_lock);
                      return error;
              }
      
              lwp_lock(t);
              lparams.sched_priority = t->l_priority;
              lpolicy = t->l_class;
              lwp_unlock(t);
              mutex_exit(t->l_proc->p_lock);
      
              /*
         * Convert to the user-visible priority value; this is the
         * inverse of convert_pri().
         *
         * The SCHED_OTHER case is a bit arbitrary given that:
         *        - we don't allow setting the priority;
         *        - the priority is dynamic.
               */
              switch (lpolicy) {
              case SCHED_OTHER:
                      lparams.sched_priority -= PRI_USER;
                      break;
              case SCHED_RR:
              case SCHED_FIFO:
                      lparams.sched_priority -= PRI_USER_RT;
                      break;
              }
      
              if (policy != NULL)
                      *policy = lpolicy;
      
              if (params != NULL)
                      *params = lparams;
      
              return error;
      }
      
      /*
       * Get scheduling parameters.
       */
      int
      sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap,
          register_t *retval)
      {
              /* {
                      syscallarg(pid_t) pid;
                      syscallarg(lwpid_t) lid;
                      syscallarg(int *) policy;
                      syscallarg(struct sched_param *) params;
              } */
              struct sched_param params;
              int error, policy;
      
              error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy,
                  &params);
              if (error)
                      goto out;
      
              error = copyout(&params, SCARG(uap, params), sizeof(params));
              if (error == 0 && SCARG(uap, policy) != NULL)
                      error = copyout(&policy, SCARG(uap, policy), sizeof(int));
      out:
              return error;
      }
      
      /*
       * Allocate the CPU set, and get it from userspace.
       */
      static int
      genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size)
      {
              kcpuset_t *kset;
              int error;
      
              kcpuset_create(&kset, true);
              error = kcpuset_copyin(sset, kset, size);
              if (error) {
                      kcpuset_unuse(kset, NULL);
              } else {
                      *dset = kset;
              }
              return error;
      }
      
      /*
       * Set affinity.
       */
      int
      sys__sched_setaffinity(struct lwp *l,
          const struct sys__sched_setaffinity_args *uap, register_t *retval)
      {
              /* {
                      syscallarg(pid_t) pid;
                      syscallarg(lwpid_t) lid;
                      syscallarg(size_t) size;
                      syscallarg(const cpuset_t *) cpuset;
              } */
              kcpuset_t *kcset, *kcpulst = NULL;
              struct cpu_info *ici, *ci;
              struct proc *p;
              struct lwp *t;
              CPU_INFO_ITERATOR cii;
              bool alloff;
              lwpid_t lid;
              u_int lcnt;
              int error;
      
              error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
              if (error)
                      return error;
      
              /*
               * Traverse _each_ CPU to:
               *  - Check that CPUs in the mask have no assigned processor set.
               *  - Check that at least one CPU from the mask is online.
               *  - Find the first target CPU to migrate.
               *
               * To avoid the race with CPU online/offline calls and processor sets,
               * cpu_lock will be locked for the entire operation.
               */
              ci = NULL;
              alloff = false;
              mutex_enter(&cpu_lock);
              for (CPU_INFO_FOREACH(cii, ici)) {
                      struct schedstate_percpu *ispc;
      
                      if (!kcpuset_isset(kcset, cpu_index(ici))) {
                              continue;
                      }
      
                      ispc = &ici->ci_schedstate;
                      /* Check that CPU is not in the processor-set */
                      if (ispc->spc_psid != PS_NONE) {
                              error = EPERM;
                              goto out;
                      }
                      /* Skip offline CPUs */
                      if (ispc->spc_flags & SPCF_OFFLINE) {
                              alloff = true;
                              continue;
                      }
                      /* Target CPU to migrate */
                      if (ci == NULL) {
                              ci = ici;
                      }
              }
              if (ci == NULL) {
                      if (alloff) {
                              /* All CPUs in the set are offline */
                              error = EPERM;
                              goto out;
                      }
                      /* Empty set */
                      kcpuset_unuse(kcset, &kcpulst);
                      kcset = NULL;
              }
      
              if (SCARG(uap, pid) != 0) {
                      /* Find the process */
                      mutex_enter(proc_lock);
                      p = proc_find(SCARG(uap, pid));
                      if (p == NULL) {
                              mutex_exit(proc_lock);
                              error = ESRCH;
                              goto out;
                      }
                      mutex_enter(p->p_lock);
                      mutex_exit(proc_lock);
                      /* Disallow modification of system processes. */
                      if ((p->p_flag & PK_SYSTEM) != 0) {
                              mutex_exit(p->p_lock);
                              error = EPERM;
                              goto out;
                      }
              } else {
                      /* Use the calling process */
                      p = l->l_proc;
                      mutex_enter(p->p_lock);
              }
      
              /*
               * Check the permission.
               */
              error = kauth_authorize_process(l->l_cred,
                  KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL);
              if (error != 0) {
                      mutex_exit(p->p_lock);
                      goto out;
              }
      
              /* Iterate through LWP(s). */
              lcnt = 0;
              lid = SCARG(uap, lid);
              LIST_FOREACH(t, &p->p_lwps, l_sibling) {
                      if (lid && lid != t->l_lid) {
                              continue;
                      }
                      lwp_lock(t);
                      /* No affinity for zombie LWPs. */
                      if (t->l_stat == LSZOMB) {
                              lwp_unlock(t);
                              continue;
                      }
                      /* First, release existing affinity, if any. */
                      if (t->l_affinity) {
                              kcpuset_unuse(t->l_affinity, &kcpulst);
                      }
                      if (kcset) {
                              /*
                               * Hold a reference on affinity mask, assign mask to
                               * LWP and migrate it to another CPU (unlocks LWP).
                               */
                              kcpuset_use(kcset);
                              t->l_affinity = kcset;
                              lwp_migrate(t, ci);
                      } else {
                              /* Old affinity mask is released, just clear. */
                              t->l_affinity = NULL;
                              lwp_unlock(t);
                      }
                      lcnt++;
              }
              mutex_exit(p->p_lock);
              if (lcnt == 0) {
                      error = ESRCH;
              }
      out:
              mutex_exit(&cpu_lock);
      
              /*
               * Drop the initial reference (LWPs, if any, have the ownership now),
               * and destroy whatever is in the G/C list, if filled.
               */
              if (kcset) {
                      kcpuset_unuse(kcset, &kcpulst);
              }
              if (kcpulst) {
                      kcpuset_destroy(kcpulst);
              }
              return error;
      }
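
/*
 * Illustrative userland sketch (added for clarity, not part of this
 * file), assuming the cpuset(3) and affinity(3) wrappers in libc and
 * libpthread: bind the calling thread to CPU 0.  Error handling and
 * additional headers are elided.
 *
 *	#include <pthread.h>
 *	#include <sched.h>
 *
 *	cpuset_t *cset = cpuset_create();
 *	int error;
 *
 *	cpuset_zero(cset);
 *	cpuset_set(0, cset);
 *	error = pthread_setaffinity_np(pthread_self(),
 *	    cpuset_size(cset), cset);	(0 on success, errno value on error)
 *	cpuset_destroy(cset);
 */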
      
      /*
       * Get affinity.
       */
      int
      sys__sched_getaffinity(struct lwp *l,
          const struct sys__sched_getaffinity_args *uap, register_t *retval)
      {
              /* {
                      syscallarg(pid_t) pid;
                      syscallarg(lwpid_t) lid;
                      syscallarg(size_t) size;
                      syscallarg(cpuset_t *) cpuset;
              } */
              struct lwp *t;
              kcpuset_t *kcset;
              int error;
      
              error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
              if (error)
                      return error;
      
              /* Locks the LWP */
              t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid));
              if (t == NULL) {
                      error = ESRCH;
                      goto out;
              }
              /* Check the permission */
              if (kauth_authorize_process(l->l_cred,
                  KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
                      mutex_exit(t->l_proc->p_lock);
                      error = EPERM;
                      goto out;
              }
              lwp_lock(t);
              if (t->l_affinity) {
                      kcpuset_copy(kcset, t->l_affinity);
              } else {
                      kcpuset_zero(kcset);
              }
              lwp_unlock(t);
              mutex_exit(t->l_proc->p_lock);
      
              error = kcpuset_copyout(kcset, SCARG(uap, cpuset), SCARG(uap, size));
      out:
              kcpuset_unuse(kcset, NULL);
              return error;
      }
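
/*
 * Illustrative userland sketch (added for clarity, not part of this
 * file): the query side of the same interface, again via affinity(3).
 *
 *	cpuset_t *cset = cpuset_create();
 *
 *	if (pthread_getaffinity_np(pthread_self(), cpuset_size(cset),
 *	    cset) == 0 && cpuset_isset(0, cset) > 0)
 *		printf("bound to CPU 0\n");
 *	cpuset_destroy(cset);
 */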
      
      /*
       * Priority protection for PTHREAD_PRIO_PROTECT. This is a weak
 * analogue of priority inheritance: temporarily raise the priority
 * of the caller while it accesses a protected resource.
       */
int
sys__sched_protect(struct lwp *l,
          const struct sys__sched_protect_args *uap, register_t *retval)
      {
              /* {
                      syscallarg(int) priority;
                      syscallarg(int *) opriority;
              } */
              int error;
              pri_t pri;
      
              KASSERT(l->l_inheritedprio == -1);
              KASSERT(l->l_auxprio == -1 || l->l_auxprio == l->l_protectprio);
              
              pri = SCARG(uap, priority);
              error = 0;
              lwp_lock(l);
              if (pri == -1) {
                      /* back out priority changes */
                      switch(l->l_protectdepth) {
                      case 0:
                              error = EINVAL;
                              break;
                      case 1:
                              l->l_protectdepth = 0;
                              l->l_protectprio = -1;
                              l->l_auxprio = -1;
                              break;
                      default:
                              l->l_protectdepth--;
                              break;
                      }
              } else if (pri < 0) {
                      /* Just retrieve the current value, for debugging */
                      if (l->l_protectprio == -1)
                              error = ENOENT;
                      else
                              *retval = l->l_protectprio - PRI_USER_RT;
              } else if (__predict_false(pri < SCHED_PRI_MIN ||
                  pri > SCHED_PRI_MAX || l->l_priority > pri + PRI_USER_RT)) {
                      /* must fail if existing priority is higher */
                      error = EPERM;
              } else {
                      /* play along but make no changes if not a realtime LWP. */
                      l->l_protectdepth++;
                      pri += PRI_USER_RT;
                      if (__predict_true(l->l_class != SCHED_OTHER && 
                          pri > l->l_protectprio)) {
                              l->l_protectprio = pri;
                              l->l_auxprio = pri;
                      }
              }
              lwp_unlock(l);
      
              return error;
      }
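
/*
 * Illustrative userland sketch (added for clarity, not part of this
 * file): a mutex using the priority-protect protocol that this syscall
 * backs is configured with the standard pthread attributes:
 *
 *	pthread_mutexattr_t attr;
 *	pthread_mutex_t mtx;
 *
 *	pthread_mutexattr_init(&attr);
 *	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_PROTECT);
 *	pthread_mutexattr_setprioceiling(&attr,
 *	    sched_get_priority_max(SCHED_FIFO));
 *	pthread_mutex_init(&mtx, &attr);
 *
 * While such a mutex is held, the owning thread runs at no less than the
 * ceiling priority; the boost is undone when the mutex is released.
 */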
      
      /*
       * Yield.
       */
      int
      sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
      {
      
              yield();
              return 0;
      }
      
      /*
       * Sysctl nodes and initialization.
       */
      static void
      sysctl_sched_setup(struct sysctllog **clog)
      {
              const struct sysctlnode *node = NULL;
      
              sysctl_createv(clog, 0, NULL, NULL,
                      CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
                      CTLTYPE_INT, "posix_sched",
                      SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
                                   "Process Scheduling option to which the "
                                   "system attempts to conform"),
                      NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0,
                      CTL_KERN, CTL_CREATE, CTL_EOL);
              sysctl_createv(clog, 0, NULL, &node,
                      CTLFLAG_PERMANENT,
                      CTLTYPE_NODE, "sched",
                      SYSCTL_DESCR("Scheduler options"),
                      NULL, 0, NULL, 0,
                      CTL_KERN, CTL_CREATE, CTL_EOL);
      
              if (node == NULL)
                      return;
      
              sysctl_createv(clog, 0, &node, NULL,
                      CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
                      CTLTYPE_INT, "pri_min",
                      SYSCTL_DESCR("Minimal POSIX real-time priority"),
                      NULL, SCHED_PRI_MIN, NULL, 0,
                      CTL_CREATE, CTL_EOL);
              sysctl_createv(clog, 0, &node, NULL,
                      CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
                      CTLTYPE_INT, "pri_max",
                      SYSCTL_DESCR("Maximal POSIX real-time priority"),
                      NULL, SCHED_PRI_MAX, NULL, 0,
                      CTL_CREATE, CTL_EOL);
      }
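
/*
 * Illustrative userland sketch (added for clarity, not part of this
 * file): the nodes created above can be read with sysctlbyname(3).
 *
 *	#include <sys/sysctl.h>
 *
 *	int prmax;
 *	size_t len = sizeof(prmax);
 *
 *	if (sysctlbyname("kern.sched.pri_max", &prmax, &len, NULL, 0) == 0)
 *		printf("maximal POSIX real-time priority: %d\n", prmax);
 */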
      
      static int
      sched_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
          void *arg0, void *arg1, void *arg2, void *arg3)
      {
              struct proc *p;
              int result;
      
              result = KAUTH_RESULT_DEFER;
              p = arg0;
      
	switch (action) {
              case KAUTH_PROCESS_SCHEDULER_GETPARAM:
                      if (kauth_cred_uidmatch(cred, p->p_cred))
                              result = KAUTH_RESULT_ALLOW;
                      break;
      
              case KAUTH_PROCESS_SCHEDULER_SETPARAM:
                      if (kauth_cred_uidmatch(cred, p->p_cred)) {
                              struct lwp *l;
                              int policy;
                              pri_t priority;
      
                              l = arg1;
                              policy = (int)(unsigned long)arg2;
                              priority = (pri_t)(unsigned long)arg3;
      
                              if ((policy == l->l_class ||
                                  (policy != SCHED_FIFO && policy != SCHED_RR)) &&
                                  priority <= l->l_priority)
                                      result = KAUTH_RESULT_ALLOW;
                      }
      
                      break;
      
              case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
                      result = KAUTH_RESULT_ALLOW;
                      break;
      
              case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
                      /* Privileged; we let the secmodel handle this. */
                      break;
      
              default:
                      break;
              }
      
	return result;
      }
      
      void
      sched_init(void)
      {
      
              sysctl_sched_setup(&sched_sysctl_log);
      
              sched_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
                  sched_listener_cb, NULL);
      }
      /*        $NetBSD: kern_mutex_obj.c,v 1.6 2018/02/05 04:25:04 ozaki-r Exp $        */
      
      /*-
       * Copyright (c) 2008 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_mutex_obj.c,v 1.6 2018/02/05 04:25:04 ozaki-r Exp $");
      
      #include <sys/param.h>
      #include <sys/atomic.h>
      #include <sys/mutex.h>
      #include <sys/pool.h>
      
      /* Mutex cache */
      #define        MUTEX_OBJ_MAGIC        0x5aa3c85d
      struct kmutexobj {
              kmutex_t        mo_lock;
              u_int                mo_magic;
              u_int                mo_refcnt;
      };
      
      static int        mutex_obj_ctor(void *, void *, int);
      
      static pool_cache_t        mutex_obj_cache                __read_mostly;
      
      /*
       * mutex_obj_init:
       *
       *        Initialize the mutex object store.
       */
      void
      mutex_obj_init(void)
      {
      
              mutex_obj_cache = pool_cache_init(sizeof(struct kmutexobj),
                  coherency_unit, 0, 0, "mutex", NULL, IPL_NONE, mutex_obj_ctor,
                  NULL, NULL);
      }
      
      /*
       * mutex_obj_ctor:
       *
       *        Initialize a new lock for the cache.
       */
      static int
      mutex_obj_ctor(void *arg, void *obj, int flags)
      {
              struct kmutexobj * mo = obj;
      
              mo->mo_magic = MUTEX_OBJ_MAGIC;
      
              return 0;
      }
      
      /*
       * mutex_obj_alloc:
       *
       *        Allocate a single lock object.
       */
      kmutex_t *
      mutex_obj_alloc(kmutex_type_t type, int ipl)
      {
              struct kmutexobj *mo;
              extern void _mutex_init(kmutex_t *, kmutex_type_t, int, uintptr_t);
      
	mo = pool_cache_get(mutex_obj_cache, PR_WAITOK);
              _mutex_init(&mo->mo_lock, type, ipl,
                  (uintptr_t)__builtin_return_address(0));
              mo->mo_refcnt = 1;
      
              return (kmutex_t *)mo;
      }
      
      /*
       * mutex_obj_hold:
       *
       *        Add a single reference to a lock object.  A reference to the object
       *        must already be held, and must be held across this call.
       */
      void
      mutex_obj_hold(kmutex_t *lock)
      {
              struct kmutexobj *mo = (struct kmutexobj *)lock;
      
              KASSERTMSG(mo->mo_magic == MUTEX_OBJ_MAGIC,
                  "%s: lock %p: mo->mo_magic (%#x) != MUTEX_OBJ_MAGIC (%#x)",
                   __func__, mo, mo->mo_magic, MUTEX_OBJ_MAGIC);
              KASSERTMSG(mo->mo_refcnt > 0,
                  "%s: lock %p: mo->mo_refcnt (%#x) == 0",
                   __func__, mo, mo->mo_refcnt);
      
              atomic_inc_uint(&mo->mo_refcnt);
      }
      
      /*
       * mutex_obj_free:
       *
       *        Drop a reference from a lock object.  If the last reference is being
       *        dropped, free the object and return true.  Otherwise, return false.
       */
      bool
      mutex_obj_free(kmutex_t *lock)
      {
              struct kmutexobj *mo = (struct kmutexobj *)lock;
      
              KASSERTMSG(mo->mo_magic == MUTEX_OBJ_MAGIC,
                  "%s: lock %p: mo->mo_magic (%#x) != MUTEX_OBJ_MAGIC (%#x)",
                   __func__, mo, mo->mo_magic, MUTEX_OBJ_MAGIC);
              KASSERTMSG(mo->mo_refcnt > 0,
                  "%s: lock %p: mo->mo_refcnt (%#x) == 0",
                   __func__, mo, mo->mo_refcnt);
      
              if (atomic_dec_uint_nv(&mo->mo_refcnt) > 0) {
                      return false;
              }
              mutex_destroy(&mo->mo_lock);
              pool_cache_put(mutex_obj_cache, mo);
              return true;
      }
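
/*
 * Illustrative sketch (added for clarity, not part of the original
 * source) of the reference protocol above: the allocator holds the
 * first reference, each additional holder takes one, and the final
 * mutex_obj_free() destroys the mutex and returns it to the cache.
 *
 *	kmutex_t *lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
 *
 *	mutex_obj_hold(lock);			(refcnt 1 -> 2)
 *	...
 *	(void)mutex_obj_free(lock);		(refcnt 2 -> 1, returns false)
 *	(void)mutex_obj_free(lock);		(refcnt 1 -> 0, object freed)
 */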
      /*        $NetBSD: kern_condvar.c,v 1.41 2018/01/30 07:52:22 ozaki-r Exp $        */
      
      /*-
       * Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Kernel condition variable implementation.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.41 2018/01/30 07:52:22 ozaki-r Exp $");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/lwp.h>
      #include <sys/condvar.h>
      #include <sys/sleepq.h>
      #include <sys/lockdebug.h>
      #include <sys/cpu.h>
      #include <sys/kernel.h>
      
      /*
       * Accessors for the private contents of the kcondvar_t data type.
       *
       *        cv_opaque[0]        sleepq...
       *        cv_opaque[1]        ...pointers
       *        cv_opaque[2]        description for ps(1)
       *
       * cv_opaque[0..1] is protected by the interlock passed to cv_wait() (enqueue
 * only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
       * and dequeue).
       *
       * cv_opaque[2] (the wmesg) is static and does not change throughout the life
       * of the CV.
       */
      #define        CV_SLEEPQ(cv)                ((sleepq_t *)(cv)->cv_opaque)
      #define        CV_WMESG(cv)                ((const char *)(cv)->cv_opaque[2])
      #define        CV_SET_WMESG(cv, v)         (cv)->cv_opaque[2] = __UNCONST(v)
      
      #define        CV_DEBUG_P(cv)        (CV_WMESG(cv) != nodebug)
      #define        CV_RA                ((uintptr_t)__builtin_return_address(0))
      
      static void                cv_unsleep(lwp_t *, bool);
      static inline void        cv_wakeup_one(kcondvar_t *);
      static inline void        cv_wakeup_all(kcondvar_t *);
      
      static syncobj_t cv_syncobj = {
              .sobj_flag        = SOBJ_SLEEPQ_SORTED,
              .sobj_unsleep        = cv_unsleep,
              .sobj_changepri        = sleepq_changepri,
              .sobj_lendpri        = sleepq_lendpri,
              .sobj_owner        = syncobj_noowner,
      };
      
      lockops_t cv_lockops = {
              .lo_name = "Condition variable",
              .lo_type = LOCKOPS_CV,
              .lo_dump = NULL,
      };
      
      static const char deadcv[] = "deadcv";
      #ifdef LOCKDEBUG
      static const char nodebug[] = "nodebug";
      
      #define CV_LOCKDEBUG_HANDOFF(l, cv) cv_lockdebug_handoff(l, cv)
      #define CV_LOCKDEBUG_PROCESS(l, cv) cv_lockdebug_process(l, cv)
      
      static inline void
      cv_lockdebug_handoff(lwp_t *l, kcondvar_t *cv)
      {
      
              if (CV_DEBUG_P(cv))
                      l->l_flag |= LW_CVLOCKDEBUG;
      }
      
      static inline void
      cv_lockdebug_process(lwp_t *l, kcondvar_t *cv)
      {
      
              if ((l->l_flag & LW_CVLOCKDEBUG) == 0)
                      return;
      
              l->l_flag &= ~LW_CVLOCKDEBUG;
              LOCKDEBUG_UNLOCKED(true, cv, CV_RA, 0);
      }
      #else
      #define CV_LOCKDEBUG_HANDOFF(l, cv) __nothing
      #define CV_LOCKDEBUG_PROCESS(l, cv) __nothing
      #endif
      
      /*
       * cv_init:
       *
       *        Initialize a condition variable for use.
       */
      void
      cv_init(kcondvar_t *cv, const char *wmesg)
      {
      #ifdef LOCKDEBUG
              bool dodebug;
      
              dodebug = LOCKDEBUG_ALLOC(cv, &cv_lockops,
                  (uintptr_t)__builtin_return_address(0));
              if (!dodebug) {
                      /* XXX This will break vfs_lockf. */
                      wmesg = nodebug;
              }
      #endif
              KASSERT(wmesg != NULL);
              CV_SET_WMESG(cv, wmesg);
              sleepq_init(CV_SLEEPQ(cv));
      }
      
      /*
       * cv_destroy:
       *
       *        Tear down a condition variable.
       */
      void
      cv_destroy(kcondvar_t *cv)
      {
      
              LOCKDEBUG_FREE(CV_DEBUG_P(cv), cv);
      #ifdef DIAGNOSTIC
              KASSERT(cv_is_valid(cv));
              CV_SET_WMESG(cv, deadcv);
      #endif
      }
      
      /*
       * cv_enter:
       *
       *        Look up and lock the sleep queue corresponding to the given
       *        condition variable, and increment the number of waiters.
       */
      static inline void
      cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l)
      {
              sleepq_t *sq;
              kmutex_t *mp;
      
              KASSERT(cv_is_valid(cv));
              KASSERT(!cpu_intr_p());
              KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);
      
              LOCKDEBUG_LOCKED(CV_DEBUG_P(cv), cv, mtx, CV_RA, 0);
      
              l->l_kpriority = true;
              mp = sleepq_hashlock(cv);
              sq = CV_SLEEPQ(cv);
              sleepq_enter(sq, l, mp);
              sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj);
              mutex_exit(mtx);
              KASSERT(cv_has_waiters(cv));
      }
      
      /*
       * cv_exit:
       *
       *        After resuming execution, check to see if we have been restarted
       *        as a result of cv_signal().  If we have, but cannot take the
 *	wakeup (because of, e.g., a pending Unix signal or timeout) then try
       *        to ensure that another LWP sees it.  This is necessary because
       *        there may be multiple waiters, and at least one should take the
       *        wakeup if possible.
       */
      static inline int
      cv_exit(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, const int error)
      {
      
              mutex_enter(mtx);
              if (__predict_false(error != 0))
                      cv_signal(cv);
      
              LOCKDEBUG_UNLOCKED(CV_DEBUG_P(cv), cv, CV_RA, 0);
              KASSERT(cv_is_valid(cv));
      
              return error;
      }
      
      /*
       * cv_unsleep:
       *
       *        Remove an LWP from the condition variable and sleep queue.  This
       *        is called when the LWP has not been awoken normally but instead
       *        interrupted: for example, when a signal is received.  Must be
       *        called with the LWP locked, and must return it unlocked.
       */
      static void
      cv_unsleep(lwp_t *l, bool cleanup)
      {
              kcondvar_t *cv __diagused;
      
              cv = (kcondvar_t *)(uintptr_t)l->l_wchan;
      
              KASSERT(l->l_wchan == (wchan_t)cv);
              KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
              KASSERT(cv_is_valid(cv));
              KASSERT(cv_has_waiters(cv));
      
              sleepq_unsleep(l, cleanup);
      }
      
      /*
       * cv_wait:
       *
 *	Wait non-interruptibly on a condition variable until awoken.
       */
      void
      cv_wait(kcondvar_t *cv, kmutex_t *mtx)
      {
              lwp_t *l = curlwp;
      
              KASSERT(mutex_owned(mtx));
      
              cv_enter(cv, mtx, l);
      
              /*
               * We can't use cv_exit() here since the cv might be destroyed before
               * this thread gets a chance to run.  Instead, hand off the lockdebug
               * responsibility to the thread that wakes us up.
               */
      
              CV_LOCKDEBUG_HANDOFF(l, cv);
              (void)sleepq_block(0, false);
              mutex_enter(mtx);
      }
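
/*
 * Illustrative sketch (added for clarity, not part of the original
 * source) of the canonical wait pattern, assuming a hypothetical "sc"
 * structure with sc_lock (kmutex_t), sc_cv (kcondvar_t) and sc_ready
 * (bool) members; the predicate is re-tested in a loop because a
 * wakeup may be spurious:
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready)
 *		cv_wait(&sc->sc_cv, &sc->sc_lock);
 *	... use the now-ready resource ...
 *	mutex_exit(&sc->sc_lock);
 */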
      
      /*
       * cv_wait_sig:
       *
 *	Wait on a condition variable until awoken or a signal is received.
       *        Will also return early if the process is exiting.  Returns zero if
       *        awoken normally, ERESTART if a signal was received and the system
       *        call is restartable, or EINTR otherwise.
       */
      int
      cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
      {
              lwp_t *l = curlwp;
              int error;
      
              KASSERT(mutex_owned(mtx));
      
              cv_enter(cv, mtx, l);
              error = sleepq_block(0, true);
              return cv_exit(cv, mtx, l, error);
      }
      
      /*
       * cv_timedwait:
       *
       *        Wait on a condition variable until awoken or the specified timeout
       *        expires.  Returns zero if awoken normally or EWOULDBLOCK if the
       *        timeout expired.
       *
       *        timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
       */
      int
      cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
      {
              lwp_t *l = curlwp;
              int error;
      
              KASSERT(mutex_owned(mtx));
      
              cv_enter(cv, mtx, l);
              error = sleepq_block(timo, false);
              return cv_exit(cv, mtx, l, error);
      }
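
/*
 * Illustrative sketch (added for clarity, not part of the original
 * source), reusing the hypothetical "sc" structure from the cv_wait()
 * example: wait for the condition, bounding each sleep by a one-second
 * timeout converted to ticks with mstohz(9), and give up once it expires.
 *
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready) {
 *		error = cv_timedwait(&sc->sc_cv, &sc->sc_lock, mstohz(1000));
 *		if (error == EWOULDBLOCK)
 *			break;
 *	}
 *	mutex_exit(&sc->sc_lock);
 */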
      
      /*
       * cv_timedwait_sig:
       *
 *	Wait on a condition variable until awoken, a signal is received, or
 *	the specified timeout expires.  Will also return early if the process is
       *        exiting.  Returns zero if awoken normally, EWOULDBLOCK if the
       *        timeout expires, ERESTART if a signal was received and the system
       *        call is restartable, or EINTR otherwise.
       *
       *        timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
       */
      int
      cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
      {
              lwp_t *l = curlwp;
              int error;
      
              KASSERT(mutex_owned(mtx));
      
              cv_enter(cv, mtx, l);
              error = sleepq_block(timo, true);
              return cv_exit(cv, mtx, l, error);
      }
      
      /*
       * Given a number of seconds, sec, and 2^64ths of a second, frac, we
       * want a number of ticks for a timeout:
       *
       *        timo = hz*(sec + frac/2^64)
       *             = hz*sec + hz*frac/2^64
       *             = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
       *             = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
       *
       * where frachi is the high 32 bits of frac and fraclo is the
       * low 32 bits.
       *
       * We assume hz < INT_MAX/2 < UINT32_MAX, so
       *
       *        hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
       *
       * since fraclo < 2^32.
       *
       * We clamp the result at INT_MAX/2 for a timeout in ticks, since we
       * can't represent timeouts higher than INT_MAX in cv_timedwait, and
       * spurious wakeup is OK.  Moreover, we don't want to wrap around,
       * because we compute end - start in ticks in order to compute the
       * remaining timeout, and that difference cannot wrap around, so we use
       * a timeout less than INT_MAX.  Using INT_MAX/2 provides plenty of
       * margin for paranoia and will exceed most waits in practice by far.
       */
      static unsigned
      bintime2timo(const struct bintime *bt)
      {
      
              KASSERT(hz < INT_MAX/2);
              CTASSERT(INT_MAX/2 < UINT32_MAX);
              if (bt->sec > ((INT_MAX/2)/hz))
                      return INT_MAX/2;
              if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
                      return INT_MAX/2;
      
              return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
      }
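
/*
 * Worked example (added for illustration): with hz = 100 and a timeout
 * of 1.5 seconds, bt->sec = 1 and bt->frac = 2^63, so
 *
 *	hz*sec           = 100
 *	hz*frachi/2^32   = 100*2^31/2^32 = 50,
 *
 * giving timo = 150 ticks, i.e. 1.5 seconds at hz = 100 as expected.
 */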
      
      /*
       * timo is in units of ticks.  We want units of seconds and 2^64ths of
       * a second.  We know hz = 1 sec/tick, and 2^64 = 1 sec/(2^64th of a
       * second), from which we can conclude 2^64 / hz = 1 (2^64th of a
       * second)/tick.  So for the fractional part, we compute
       *
       *        frac = rem * 2^64 / hz
       *             = ((rem * 2^32) / hz) * 2^32
       *
       * Using truncating integer division instead of real division will
       * leave us with only about 32 bits of precision, which means about
       * 1/4-nanosecond resolution, which is good enough for our purposes.
       */
      static struct bintime
      timo2bintime(unsigned timo)
      {
      
              return (struct bintime) {
                      .sec = timo / hz,
                      .frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
              };
      }
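
/*
 * Worked example (added for illustration): with hz = 100 and
 * timo = 150 ticks,
 *
 *	sec  = 150 / 100 = 1
 *	frac = ((150 % 100) * 2^32 / 100) * 2^32 = 2^31 * 2^32 = 2^63,
 *
 * i.e. 1.5 seconds, the inverse of the bintime2timo() example above.
 */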
      
      /*
       * cv_timedwaitbt:
       *
       *        Wait on a condition variable until awoken or the specified
       *        timeout expires.  Returns zero if awoken normally or
       *        EWOULDBLOCK if the timeout expires.
       *
       *        On entry, bt is a timeout in bintime.  cv_timedwaitbt subtracts
       *        the time slept, so on exit, bt is the time remaining after
       *        sleeping, possibly negative if the complete time has elapsed.
 *	No infinite timeout; use cv_wait instead.
       *
       *        epsilon is a requested maximum error in timeout (excluding
       *        spurious wakeups).  Currently not used, will be used in the
       *        future to choose between low- and high-resolution timers.
       *        Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
       *        where r is the finest resolution of clock available and s is
       *        scheduling delays for scheduler overhead and competing threads.
       *        Time is measured by the interrupt source implementing the
       *        timeout, not by another timecounter.
       */
      int
      cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
          const struct bintime *epsilon __diagused)
      {
              struct bintime slept;
              unsigned start, end;
              int error;
      
              KASSERTMSG(bt->sec >= 0, "negative timeout");
              KASSERTMSG(epsilon != NULL, "specify maximum requested delay");
      
              /*
               * hardclock_ticks is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
               * wraparound and just treat it as unsigned.
               */
              start = hardclock_ticks;
              error = cv_timedwait(cv, mtx, bintime2timo(bt));
              end = hardclock_ticks;
      
              slept = timo2bintime(end - start);
              /* bt := bt - slept */
              bintime_sub(bt, &slept);
      
              return error;
      }
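
/*
 * Illustrative sketch (added for clarity, not part of the original
 * source), reusing the hypothetical "sc" structure from the cv_wait()
 * example: because bt is decremented by the time slept, the remaining
 * budget carries across spurious wakeups.
 *
 *	struct bintime bt = { .sec = 1, .frac = 0 };	(one second total)
 *	const struct bintime eps = { .sec = 0, .frac = 0 };
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready) {
 *		error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock, &bt, &eps);
 *		if (error)
 *			break;
 *	}
 *	mutex_exit(&sc->sc_lock);
 */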
      
      /*
       * cv_timedwaitbt_sig:
       *
       *        Wait on a condition variable until awoken, the specified
       *        timeout expires, or interrupted by a signal.  Returns zero if
       *        awoken normally, EWOULDBLOCK if the timeout expires, or
       *        EINTR/ERESTART if interrupted by a signal.
       *
       *        On entry, bt is a timeout in bintime.  cv_timedwaitbt_sig
       *        subtracts the time slept, so on exit, bt is the time remaining
 *	after sleeping.  No infinite timeout; use cv_wait_sig instead.
       *
       *        epsilon is a requested maximum error in timeout (excluding
       *        spurious wakeups).  Currently not used, will be used in the
       *        future to choose between low- and high-resolution timers.
       */
      int
      cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
          const struct bintime *epsilon __diagused)
      {
              struct bintime slept;
              unsigned start, end;
              int error;
      
              KASSERTMSG(bt->sec >= 0, "negative timeout");
              KASSERTMSG(epsilon != NULL, "specify maximum requested delay");
      
              /*
               * hardclock_ticks is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
               * wraparound and just treat it as unsigned.
               */
              start = hardclock_ticks;
              error = cv_timedwait_sig(cv, mtx, bintime2timo(bt));
              end = hardclock_ticks;
      
              slept = timo2bintime(end - start);
              /* bt := bt - slept */
              bintime_sub(bt, &slept);
      
              return error;
      }
      
      /*
       * cv_signal:
       *
       *        Wake the highest priority LWP waiting on a condition variable.
       *        Must be called with the interlocking mutex held.
       */
      void
      cv_signal(kcondvar_t *cv)
      {
      
              /* LOCKDEBUG_WAKEUP(CV_DEBUG_P(cv), cv, CV_RA); */
              KASSERT(cv_is_valid(cv));
      
              if (__predict_false(!TAILQ_EMPTY(CV_SLEEPQ(cv))))
                      cv_wakeup_one(cv);
      }
      
      static inline void
      cv_wakeup_one(kcondvar_t *cv)
      {
              sleepq_t *sq;
              kmutex_t *mp;
              lwp_t *l;
      
              KASSERT(cv_is_valid(cv));
      
              mp = sleepq_hashlock(cv);
              sq = CV_SLEEPQ(cv);
              l = TAILQ_FIRST(sq);
              if (l == NULL) {
                      mutex_spin_exit(mp);
                      return;
              }
              KASSERT(l->l_sleepq == sq);
              KASSERT(l->l_mutex == mp);
              KASSERT(l->l_wchan == cv);
              CV_LOCKDEBUG_PROCESS(l, cv);
              sleepq_remove(sq, l);
              mutex_spin_exit(mp);
      
              KASSERT(cv_is_valid(cv));
      }
      
      /*
       * cv_broadcast:
       *
       *        Wake all LWPs waiting on a condition variable.  Must be called
       *        with the interlocking mutex held.
       */
      void
      cv_broadcast(kcondvar_t *cv)
      {
      
              /* LOCKDEBUG_WAKEUP(CV_DEBUG_P(cv), cv, CV_RA); */
	KASSERT(cv_is_valid(cv));
      
	if (__predict_false(!TAILQ_EMPTY(CV_SLEEPQ(cv))))
                      cv_wakeup_all(cv);
      }
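
/*
 * Illustrative sketch (added for clarity, not part of the original
 * source): the producer side of the cv_wait() pattern shown earlier;
 * the interlock is held across both the state change and the wakeup.
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_ready = true;
 *	cv_broadcast(&sc->sc_cv);
 *	mutex_exit(&sc->sc_lock);
 */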
      
      static inline void
      cv_wakeup_all(kcondvar_t *cv)
      {
              sleepq_t *sq;
              kmutex_t *mp;
              lwp_t *l, *next;
      
              KASSERT(cv_is_valid(cv));
      
              mp = sleepq_hashlock(cv);
              sq = CV_SLEEPQ(cv);
              for (l = TAILQ_FIRST(sq); l != NULL; l = next) {
                      KASSERT(l->l_sleepq == sq);
                      KASSERT(l->l_mutex == mp);
                      KASSERT(l->l_wchan == cv);
                      next = TAILQ_NEXT(l, l_sleepchain);
                      CV_LOCKDEBUG_PROCESS(l, cv);
                      sleepq_remove(sq, l);
              }
              mutex_spin_exit(mp);
      
              KASSERT(cv_is_valid(cv));
      }
      
      /*
       * cv_has_waiters:
       *
       *        For diagnostic assertions: return non-zero if a condition
       *        variable has waiters.
       */
      bool
      cv_has_waiters(kcondvar_t *cv)
      {
      
              return !TAILQ_EMPTY(CV_SLEEPQ(cv));
      }
      
      /*
       * cv_is_valid:
       *
       *        For diagnostic assertions: return non-zero if a condition
       *        variable appears to be valid.  No locks need be held.
       */
      bool
      cv_is_valid(kcondvar_t *cv)
      {
      
	return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
      }
      /*        $NetBSD: kern_rwlock.c,v 1.54 2019/05/09 05:00:31 ozaki-r Exp $        */
      
      /*-
       * Copyright (c) 2002, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Jason R. Thorpe and Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Kernel reader/writer lock implementation, modeled after those
       * found in Solaris, a description of which can be found in:
       *
       *        Solaris Internals: Core Kernel Architecture, Jim Mauro and
       *            Richard McDougall.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.54 2019/05/09 05:00:31 ozaki-r Exp $");
      
      #define        __RWLOCK_PRIVATE
      
      #include <sys/param.h>
      #include <sys/proc.h>
      #include <sys/rwlock.h>
      #include <sys/sched.h>
      #include <sys/sleepq.h>
      #include <sys/systm.h>
      #include <sys/lockdebug.h>
      #include <sys/cpu.h>
      #include <sys/atomic.h>
      #include <sys/lock.h>
      #include <sys/pserialize.h>
      
      #include <dev/lockstat.h>
      
      /*
       * LOCKDEBUG
       */
      
      #if defined(LOCKDEBUG)
      
      #define        RW_WANTLOCK(rw, op)                                                \
              LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw),                        \
                  (uintptr_t)__builtin_return_address(0), op == RW_READER);
      #define        RW_LOCKED(rw, op)                                                \
              LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL,                        \
                  (uintptr_t)__builtin_return_address(0), op == RW_READER);
      #define        RW_UNLOCKED(rw, op)                                                \