/*        $NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $        */
      
      /*-
       * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Shared support code for kernels built with the DEBUG option.
       */
       
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $");
      
      #include "opt_ddb.h"
      
      #include <sys/param.h>
      #include <sys/proc.h>
      #include <sys/systm.h>
      #include <sys/kmem.h>
      #include <sys/debug.h>
      #include <sys/atomic.h>
      #include <sys/cpu.h>
      
      #include <uvm/uvm_extern.h>
      
      #include <machine/lock.h>
      
      /*
       * Allocation/free validation by pointer address.  Introduces
       * significant overhead and is not enabled by default.  Patch
       * `debug_freecheck' to 1 at boot time to enable.
       */
      #define        FREECHECK_BYTES                (8*1024*1024)
      
      typedef struct fcitem {
              void                *i_addr;
              struct fcitem        *i_next;
      } fcitem_t;
      
      fcitem_t                *freecheck_free;
      __cpu_simple_lock_t        freecheck_lock;
      u_int                        debug_freecheck;
      
      void
      debug_init(void)
      {
              size_t cnt;
              fcitem_t *i;
      
              __cpu_simple_lock_init(&freecheck_lock);
      
              if (debug_freecheck) {
                      i = (fcitem_t *)uvm_km_alloc(kernel_map, FREECHECK_BYTES, 0,
                          UVM_KMF_WIRED);
                      if (i == NULL) {
                              printf("freecheck_init: unable to allocate memory");
                              return;
                      }
      
                      for (cnt = FREECHECK_BYTES / sizeof(*i); cnt != 0; cnt--) {
                              i->i_next = freecheck_free;
                              freecheck_free = i++;
                      }
              }
      }
      
      void
      freecheck_out(void **head, void *addr)
      {
              fcitem_t *i;
              int s;
      
	if (!debug_freecheck)
                      return;
      
              s = splvm();
              __cpu_simple_lock(&freecheck_lock);
              for (i = *head; i != NULL; i = i->i_next) {
                      if (i->i_addr != addr)
                              continue;
                      __cpu_simple_unlock(&freecheck_lock);
                      splx(s);
                      panic("freecheck_out: %p already out", addr);
              }
              if ((i = freecheck_free) != NULL) {
                      freecheck_free = i->i_next;
                      i->i_addr = addr;
                      i->i_next = *head;
                      *head = i;
              }
              __cpu_simple_unlock(&freecheck_lock);
              splx(s);
      
              if (i == NULL) {
                      if (atomic_swap_uint(&debug_freecheck, 1) == 0)
                              printf("freecheck_out: no more slots\n");
              }
      }
      
      void
      freecheck_in(void **head, void *addr)
      {
              fcitem_t *i;
              void *pp;
              int s;
      
	if (!debug_freecheck)
                      return;
      
              s = splvm();
              __cpu_simple_lock(&freecheck_lock);
              for (i = *head, pp = head; i != NULL; pp = &i->i_next, i = i->i_next) {
                      if (i->i_addr == addr) {
                              *(fcitem_t **)pp = i->i_next;
                              i->i_next = freecheck_free;
                              freecheck_free = i;
                              break;
                      }
              }
              __cpu_simple_unlock(&freecheck_lock);
              splx(s);
      
              if (i != NULL)
                      return;
      
      #ifdef DDB
              printf("freecheck_in: %p not out\n", addr);
              Debugger();
      #else
              panic("freecheck_in: %p not out", addr);
      #endif
      }
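
/*
 * Usage sketch: a consumer pairs freecheck_out() at allocation time with
 * freecheck_in() at free time, using one list head per resource type.
 * The head pointer and wrapper functions below are hypothetical, shown
 * only to illustrate the pairing; in-tree consumers are expected to go
 * through the DEBUG-only FREECHECK_OUT()/FREECHECK_IN() wrappers.
 *
 *	static void *example_freecheck_head;
 *
 *	void *
 *	example_alloc(size_t sz)
 *	{
 *		void *p = kmem_alloc(sz, KM_SLEEP);
 *
 *		freecheck_out(&example_freecheck_head, p);
 *		return p;
 *	}
 *
 *	void
 *	example_free(void *p, size_t sz)
 *	{
 *		freecheck_in(&example_freecheck_head, p);
 *		kmem_free(p, sz);
 *	}
 */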
      /*        $NetBSD: sys_sig.c,v 1.48 2019/09/08 07:00:20 maxv Exp $        */
      
      /*-
       * Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Copyright (c) 1982, 1986, 1989, 1991, 1993
       *        The Regents of the University of California.  All rights reserved.
       * (c) UNIX System Laboratories, Inc.
       * All or some portions of this file are derived from material licensed
       * to the University of California by American Telephone and Telegraph
       * Co. or Unix System Laboratories, Inc. and are reproduced herein with
       * the permission of UNIX System Laboratories, Inc.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)kern_sig.c        8.14 (Berkeley) 5/14/95
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: sys_sig.c,v 1.48 2019/09/08 07:00:20 maxv Exp $");
      
      #include "opt_dtrace.h"
      
      #include <sys/param.h>
      #include <sys/kernel.h>
      #include <sys/signalvar.h>
      #include <sys/proc.h>
      #include <sys/pool.h>
      #include <sys/syscallargs.h>
      #include <sys/kauth.h>
      #include <sys/wait.h>
      #include <sys/kmem.h>
      #include <sys/module.h>
      #include <sys/sdt.h>
      
      SDT_PROVIDER_DECLARE(proc);
      SDT_PROBE_DEFINE2(proc, kernel, , signal__clear,
          "int",                 /* signal */
          "ksiginfo_t *");        /* signal-info */
      
      int
      sys___sigaction_sigtramp(struct lwp *l,
          const struct sys___sigaction_sigtramp_args *uap, register_t *retval)
      {
              /* {
                      syscallarg(int)                                signum;
                      syscallarg(const struct sigaction *)        nsa;
                      syscallarg(struct sigaction *)                osa;
                      syscallarg(void *)                        tramp;
                      syscallarg(int)                                vers;
              } */
              struct sigaction nsa, osa;
              int error;
      
              if (SCARG(uap, nsa)) {
                      error = copyin(SCARG(uap, nsa), &nsa, sizeof(nsa));
                      if (error)
                              return (error);
              }
              error = sigaction1(l, SCARG(uap, signum),
                  SCARG(uap, nsa) ? &nsa : 0, SCARG(uap, osa) ? &osa : 0,
                  SCARG(uap, tramp), SCARG(uap, vers));
              if (error)
                      return (error);
              if (SCARG(uap, osa)) {
                      error = copyout(&osa, SCARG(uap, osa), sizeof(osa));
                      if (error)
                              return (error);
              }
              return 0;
      }
      
      /*
 * Manipulate the signal mask.  The new and old masks are passed by
 * pointer and copied in/out here; the library stub provides the
 * standard sigprocmask(2) interface on top of this.
       */
      int
      sys___sigprocmask14(struct lwp *l, const struct sys___sigprocmask14_args *uap,
          register_t *retval)
      {
              /* {
                      syscallarg(int)                        how;
                      syscallarg(const sigset_t *)        set;
                      syscallarg(sigset_t *)                oset;
              } */
              struct proc        *p = l->l_proc;
              sigset_t        nss, oss;
              int                error;
      
              if (SCARG(uap, set)) {
                      error = copyin(SCARG(uap, set), &nss, sizeof(nss));
                      if (error)
                              return error;
              }
              mutex_enter(p->p_lock);
              error = sigprocmask1(l, SCARG(uap, how),
                  SCARG(uap, set) ? &nss : 0, SCARG(uap, oset) ? &oss : 0);
              mutex_exit(p->p_lock);
              if (error)
                      return error;
              if (SCARG(uap, oset)) {
                      error = copyout(&oss, SCARG(uap, oset), sizeof(oss));
                      if (error)
                              return error;
              }
              return 0;
      }
      
      int
      sys___sigpending14(struct lwp *l, const struct sys___sigpending14_args *uap,
          register_t *retval)
      {
              /* {
                      syscallarg(sigset_t *)        set;
              } */
              sigset_t ss;
      
              sigpending1(l, &ss);
              return copyout(&ss, SCARG(uap, set), sizeof(ss));
      }
      
      /*
 * Suspend the process until a signal arrives, installing the given mask
 * for the duration.  The mask is passed by pointer and copied in here;
 * the libc stub provides the standard sigsuspend(2) interface.
       */
      int
      sys___sigsuspend14(struct lwp *l, const struct sys___sigsuspend14_args *uap,
          register_t *retval)
      {
              /* {
                      syscallarg(const sigset_t *)        set;
              } */
              sigset_t        ss;
              int                error;
      
              if (SCARG(uap, set)) {
                      error = copyin(SCARG(uap, set), &ss, sizeof(ss));
                      if (error)
                              return error;
              }
              return sigsuspend1(l, SCARG(uap, set) ? &ss : 0);
      }
      
      int
      sys___sigaltstack14(struct lwp *l, const struct sys___sigaltstack14_args *uap,
          register_t *retval)
      {
              /* {
                      syscallarg(const struct sigaltstack *)        nss;
                      syscallarg(struct sigaltstack *)        oss;
              } */
              struct sigaltstack        nss, oss;
              int                        error;
      
              if (SCARG(uap, nss)) {
                      error = copyin(SCARG(uap, nss), &nss, sizeof(nss));
                      if (error)
                              return error;
              }
              error = sigaltstack1(l,
                  SCARG(uap, nss) ? &nss : 0, SCARG(uap, oss) ? &oss : 0);
              if (error)
                      return error;
              if (SCARG(uap, oss)) {
                      error = copyout(&oss, SCARG(uap, oss), sizeof(oss));
                      if (error)
                              return error;
              }
              return 0;
      }
      
      int
      kill1(struct lwp *l, pid_t pid, ksiginfo_t *ksi, register_t *retval)
      {
              int error;
              struct proc *p;
      
              if ((u_int)ksi->ksi_signo >= NSIG)
                      return EINVAL;
      
              if (pid != l->l_proc->p_pid) {
                      if (ksi->ksi_pid != l->l_proc->p_pid)
                              return EPERM;
      
                      if (ksi->ksi_uid != kauth_cred_geteuid(l->l_cred))
                              return EPERM;
      
                      switch (ksi->ksi_code) {
                      case SI_USER:
                      case SI_QUEUE:
                              break;
                      default:
                              return EPERM;
                      }
              }
      
              if (pid > 0) {
                      /* kill single process */
                      mutex_enter(proc_lock);
                      p = proc_find_raw(pid);
                      if (p == NULL || (p->p_stat != SACTIVE && p->p_stat != SSTOP)) {
                              mutex_exit(proc_lock);
                              /* IEEE Std 1003.1-2001: return success for zombies */
                              return p ? 0 : ESRCH;
                      }
                      mutex_enter(p->p_lock);
                      error = kauth_authorize_process(l->l_cred,
                          KAUTH_PROCESS_SIGNAL, p, KAUTH_ARG(ksi->ksi_signo),
                          NULL, NULL);
                      if (!error && ksi->ksi_signo) {
                              error = kpsignal2(p, ksi);
                      }
                      mutex_exit(p->p_lock);
                      mutex_exit(proc_lock);
                      return error;
              }
      
              switch (pid) {
              case -1:                /* broadcast signal */
                      return killpg1(l, ksi, 0, 1);
              case 0:                        /* signal own process group */
                      return killpg1(l, ksi, 0, 0);
              default:                /* negative explicit process group */
                      return killpg1(l, ksi, -pid, 0);
              }
              /* NOTREACHED */
      }
      
      int
      sys_sigqueueinfo(struct lwp *l, const struct sys_sigqueueinfo_args *uap,
          register_t *retval)
      {
              /* {
		syscallarg(pid_t)	pid;
                      syscallarg(const siginfo_t *)        info;
              } */
              ksiginfo_t        ksi;
              int error;
      
              KSI_INIT(&ksi);
      
              if ((error = copyin(&SCARG(uap, info)->_info, &ksi.ksi_info,
                  sizeof(ksi.ksi_info))) != 0)
                      return error;
      
              return kill1(l, SCARG(uap, pid), &ksi, retval);
      }
      
      int
      sys_kill(struct lwp *l, const struct sys_kill_args *uap, register_t *retval)
      {
              /* {
                      syscallarg(pid_t)        pid;
                      syscallarg(int)        signum;
              } */
              ksiginfo_t        ksi;
      
              KSI_INIT(&ksi);
      
              ksi.ksi_signo = SCARG(uap, signum);
              ksi.ksi_code = SI_USER;
              ksi.ksi_pid = l->l_proc->p_pid;
              ksi.ksi_uid = kauth_cred_geteuid(l->l_cred);
      
              return kill1(l, SCARG(uap, pid), &ksi, retval);
      }
      
      int
      sys_getcontext(struct lwp *l, const struct sys_getcontext_args *uap,
          register_t *retval)
      {
              /* {
                      syscallarg(struct __ucontext *) ucp;
              } */
              struct proc *p = l->l_proc;
              ucontext_t uc;
      
              memset(&uc, 0, sizeof(uc));
      
              mutex_enter(p->p_lock);
              getucontext(l, &uc);
              mutex_exit(p->p_lock);
      
              return copyout(&uc, SCARG(uap, ucp), sizeof (*SCARG(uap, ucp)));
      }
      
      int
      sys_setcontext(struct lwp *l, const struct sys_setcontext_args *uap,
          register_t *retval)
      {
              /* {
                      syscallarg(const ucontext_t *) ucp;
              } */
              struct proc *p = l->l_proc;
              ucontext_t uc;
              int error;
      
              error = copyin(SCARG(uap, ucp), &uc, sizeof (uc));
              if (error)
                      return error;
              if ((uc.uc_flags & _UC_CPU) == 0)
                      return EINVAL;
              mutex_enter(p->p_lock);
              error = setucontext(l, &uc);
              mutex_exit(p->p_lock);
              if (error)
                       return error;
      
              return EJUSTRETURN;
      }
      
      /*
 * sigtimedwait(2) system call, also used to implement sigwaitinfo() and
 * sigwait().
 *
 * This only handles a single LWP waiting for a signal; libpthread
 * provides its own sigtimedwait() wrapper to do the right thing for
 * individual threads.
       */
      int
      sys_____sigtimedwait50(struct lwp *l,
          const struct sys_____sigtimedwait50_args *uap, register_t *retval)
      {
      
              return sigtimedwait1(l, uap, retval, copyin, copyout, copyin, copyout);
      }
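
/*
 * Userland sketch (not kernel code) of how a sigwaitinfo()-style caller
 * exercises the timeout handling in sigtimedwait1() below: a NULL timeout
 * blocks indefinitely, while an all-zero timespec polls and fails with
 * EAGAIN if nothing in the set is pending.  wait_for_sigusr1() is an
 * illustrative name only.
 *
 *	#include <signal.h>
 *	#include <time.h>
 *
 *	static int
 *	wait_for_sigusr1(void)
 *	{
 *		sigset_t set;
 *		siginfo_t info;
 *		struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
 *
 *		sigemptyset(&set);
 *		sigaddset(&set, SIGUSR1);
 *		sigprocmask(SIG_BLOCK, &set, NULL);	// must be blocked first
 *		return sigtimedwait(&set, &info, &ts);	// signo, or -1/EAGAIN
 *	}
 */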
      
      int
      sigaction1(struct lwp *l, int signum, const struct sigaction *nsa,
              struct sigaction *osa, const void *tramp, int vers)
      {
              struct proc *p;
              struct sigacts *ps;
              sigset_t tset;
              int prop, error;
              ksiginfoq_t kq;
              static bool v0v1valid;
      
              if (signum <= 0 || signum >= NSIG)
                      return EINVAL;
      
              p = l->l_proc;
              error = 0;
              ksiginfo_queue_init(&kq);
      
              /*
               * Trampoline ABI version 0 is reserved for the legacy kernel
               * provided on-stack trampoline.  Conversely, if we are using a
	 * non-0 ABI version, we must have a trampoline.  Only validate the
	 * version if a new sigaction was supplied with an actual handler
	 * (not SIG_IGN or SIG_DFL), since those don't require a trampoline.
	 * Emulations use legacy kernel trampolines with version 0; check
	 * for that case as well.
               *
               * If version < 2, we try to autoload the compat module.  Note
               * that we interlock with the unload check in compat_modcmd()
               * using kernconfig_lock.  If the autoload fails, we don't try it
               * again for this process.
               */
              if (nsa != NULL && nsa->sa_handler != SIG_IGN
                  && nsa->sa_handler != SIG_DFL) {
                      if (__predict_false(vers < 2)) {
                              if (p->p_flag & PK_32)
                                      v0v1valid = true;
                              else if ((p->p_lflag & PL_SIGCOMPAT) == 0) {
                                      kernconfig_lock();
                                      if (sendsig_sigcontext_vec == NULL) {
                                              (void)module_autoload("compat",
                                                  MODULE_CLASS_ANY);
                                      }
                                      if (sendsig_sigcontext_vec != NULL) {
                                              /*
                                               * We need to remember if the
                                               * sigcontext method may be useable,
                                               * because libc may use it even
                                               * if siginfo is available.
                                               */
                                              v0v1valid = true;
                                      }
                                      mutex_enter(proc_lock);
                                      /*
                                       * Prevent unload of compat module while
                                       * this process remains.
                                       */
                                      p->p_lflag |= PL_SIGCOMPAT;
                                      mutex_exit(proc_lock);
                                      kernconfig_unlock();
                              }
                      }
      
                      switch (vers) {
                      case 0:
                              /* sigcontext, kernel supplied trampoline. */
                              if (tramp != NULL || !v0v1valid) {
                                      return EINVAL;
                              }
                              break;
                      case 1:
                              /* sigcontext, user supplied trampoline. */
                              if (tramp == NULL || !v0v1valid) {
                                      return EINVAL;
                              }
                              break;
                      case 2:
                      case 3:
                              /* siginfo, user supplied trampoline. */
                              if (tramp == NULL) {
                                      return EINVAL;
                              }
                              break;
                      default:
                              return EINVAL;
                      }
              }
      
              mutex_enter(p->p_lock);
      
              ps = p->p_sigacts;
              if (osa)
                      sigaction_copy(osa, &SIGACTION_PS(ps, signum));
              if (!nsa)
                      goto out;
      
              prop = sigprop[signum];
              if ((nsa->sa_flags & ~SA_ALLBITS) || (prop & SA_CANTMASK)) {
                      error = EINVAL;
                      goto out;
              }
      
              sigaction_copy(&SIGACTION_PS(ps, signum), nsa);
              ps->sa_sigdesc[signum].sd_tramp = tramp;
              ps->sa_sigdesc[signum].sd_vers = vers;
              sigminusset(&sigcantmask, &SIGACTION_PS(ps, signum).sa_mask);
      
              if ((prop & SA_NORESET) != 0)
                      SIGACTION_PS(ps, signum).sa_flags &= ~SA_RESETHAND;
      
              if (signum == SIGCHLD) {
                      if (nsa->sa_flags & SA_NOCLDSTOP)
                              p->p_sflag |= PS_NOCLDSTOP;
                      else
                              p->p_sflag &= ~PS_NOCLDSTOP;
                      if (nsa->sa_flags & SA_NOCLDWAIT) {
                              /*
                               * Paranoia: since SA_NOCLDWAIT is implemented by
                               * reparenting the dying child to PID 1 (and trust
                               * it to reap the zombie), PID 1 itself is forbidden
                               * to set SA_NOCLDWAIT.
                               */
                              if (p->p_pid == 1)
                                      p->p_flag &= ~PK_NOCLDWAIT;
                              else
                                      p->p_flag |= PK_NOCLDWAIT;
                      } else
                              p->p_flag &= ~PK_NOCLDWAIT;
      
                      if (nsa->sa_handler == SIG_IGN) {
                              /*
                               * Paranoia: same as above.
                               */
                              if (p->p_pid == 1)
                                      p->p_flag &= ~PK_CLDSIGIGN;
                              else
                                      p->p_flag |= PK_CLDSIGIGN;
                      } else
                              p->p_flag &= ~PK_CLDSIGIGN;
              }
      
              if ((nsa->sa_flags & SA_NODEFER) == 0)
                      sigaddset(&SIGACTION_PS(ps, signum).sa_mask, signum);
              else
                      sigdelset(&SIGACTION_PS(ps, signum).sa_mask, signum);
      
              /*
               * Set bit in p_sigctx.ps_sigignore for signals that are set to
               * SIG_IGN, and for signals set to SIG_DFL where the default is to
               * ignore. However, don't put SIGCONT in p_sigctx.ps_sigignore, as
               * we have to restart the process.
               */
              if (nsa->sa_handler == SIG_IGN ||
                  (nsa->sa_handler == SIG_DFL && (prop & SA_IGNORE) != 0)) {
                      /* Never to be seen again. */
                      sigemptyset(&tset);
                      sigaddset(&tset, signum);
                      sigclearall(p, &tset, &kq);
                      if (signum != SIGCONT) {
                              /* Easier in psignal */
                              sigaddset(&p->p_sigctx.ps_sigignore, signum);
                      }
                      sigdelset(&p->p_sigctx.ps_sigcatch, signum);
              } else {
                      sigdelset(&p->p_sigctx.ps_sigignore, signum);
                      if (nsa->sa_handler == SIG_DFL)
                              sigdelset(&p->p_sigctx.ps_sigcatch, signum);
                      else
                              sigaddset(&p->p_sigctx.ps_sigcatch, signum);
              }
      
              /*
               * Previously held signals may now have become visible.  Ensure that
               * we check for them before returning to userspace.
               */
              if (sigispending(l, 0)) {
                      lwp_lock(l);
                      l->l_flag |= LW_PENDSIG;
                      lwp_unlock(l);
              }
      out:
              mutex_exit(p->p_lock);
              ksiginfo_queue_drain(&kq);
      
              return error;
      }
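
/*
 * Userland sketch of the interface validated above: installing a handler
 * with sigaction(2) and SA_SIGINFO.  The trampoline pointer and ABI
 * version are filled in by the libc stub, not by the application, and
 * SIG_IGN/SIG_DFL dispositions skip that validation entirely.  The names
 * on_usr1/install_handler are illustrative only.
 *
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	static void
 *	on_usr1(int sig, siginfo_t *si, void *ctx)
 *	{
 *		// async-signal-safe work only
 *	}
 *
 *	static int
 *	install_handler(void)
 *	{
 *		struct sigaction sa;
 *
 *		memset(&sa, 0, sizeof(sa));
 *		sa.sa_sigaction = on_usr1;
 *		sa.sa_flags = SA_SIGINFO;
 *		sigemptyset(&sa.sa_mask);
 *		return sigaction(SIGUSR1, &sa, NULL);
 *	}
 */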
      
      int
      sigprocmask1(struct lwp *l, int how, const sigset_t *nss, sigset_t *oss)
      {
              sigset_t *mask = &l->l_sigmask;
              bool more;
      
              KASSERT(mutex_owned(l->l_proc->p_lock));
      
              if (oss) {
                      *oss = *mask;
              }
      
              if (nss == NULL) {
                      return 0;
              }
      
              switch (how) {
              case SIG_BLOCK:
                      sigplusset(nss, mask);
                      more = false;
                      break;
              case SIG_UNBLOCK:
                      sigminusset(nss, mask);
                      more = true;
                      break;
              case SIG_SETMASK:
                      *mask = *nss;
                      more = true;
                      break;
              default:
                      return EINVAL;
              }
              sigminusset(&sigcantmask, mask);
              if (more && sigispending(l, 0)) {
                      /*
                       * Check for pending signals on return to user.
                       */
                      lwp_lock(l);
                      l->l_flag |= LW_PENDSIG;
                      lwp_unlock(l);
              }
              return 0;
      }
      
      void
      sigpending1(struct lwp *l, sigset_t *ss)
      {
              struct proc *p = l->l_proc;
      
              mutex_enter(p->p_lock);
              *ss = l->l_sigpend.sp_set;
              sigplusset(&p->p_sigpend.sp_set, ss);
              mutex_exit(p->p_lock);
      }
      
      void
      sigsuspendsetup(struct lwp *l, const sigset_t *ss)
      {
	struct proc *p = l->l_proc;
      
              /*
	 * When returning from sigsuspend/pselect/pollts, we want the old
	 * mask to be restored after the signal handler has finished.
	 * Thus, we save it here and mark the sigctx structure to indicate
	 * this.
               */
              mutex_enter(p->p_lock);
              l->l_sigrestore = 1;
              l->l_sigoldmask = l->l_sigmask;
              l->l_sigmask = *ss;
              sigminusset(&sigcantmask, &l->l_sigmask);
      
              /* Check for pending signals when sleeping. */
              if (sigispending(l, 0)) {
                      lwp_lock(l);
                      l->l_flag |= LW_PENDSIG;
                      lwp_unlock(l);
              }
	mutex_exit(p->p_lock);
      }
      
      void
      sigsuspendteardown(struct lwp *l)
      {
	struct proc *p = l->l_proc;

	mutex_enter(p->p_lock);
	/* Check for pending signals when sleeping. */
	if (l->l_sigrestore) {
		if (sigispending(l, 0)) {
			lwp_lock(l);
			l->l_flag |= LW_PENDSIG;
			lwp_unlock(l);
		} else {
			l->l_sigrestore = 0;
			l->l_sigmask = l->l_sigoldmask;
		}
	}
	mutex_exit(p->p_lock);
      }
      
      int
      sigsuspend1(struct lwp *l, const sigset_t *ss)
      {
      
              if (ss)
                      sigsuspendsetup(l, ss);
      
              while (kpause("pause", true, 0, NULL) == 0)
                      ;
      
              /* always return EINTR rather than ERESTART... */
              return EINTR;
      }
      
      int
      sigaltstack1(struct lwp *l, const struct sigaltstack *nss,
          struct sigaltstack *oss)
      {
              struct proc *p = l->l_proc;
              int error = 0;
      
              mutex_enter(p->p_lock);
      
              if (oss)
                      *oss = l->l_sigstk;
      
              if (nss) {
                      if (nss->ss_flags & ~SS_ALLBITS)
                              error = EINVAL;
                      else if (nss->ss_flags & SS_DISABLE) {
                              if (l->l_sigstk.ss_flags & SS_ONSTACK)
                                      error = EINVAL;
                      } else if (nss->ss_size < MINSIGSTKSZ)
                              error = ENOMEM;
      
                      if (!error)
                              l->l_sigstk = *nss;
              }
      
              mutex_exit(p->p_lock);
      
              return error;
      }
      
      int
      sigtimedwait1(struct lwp *l, const struct sys_____sigtimedwait50_args *uap,
          register_t *retval, copyin_t fetchss, copyout_t storeinf, copyin_t fetchts,
          copyout_t storets)
      {
              /* {
                      syscallarg(const sigset_t *) set;
                      syscallarg(siginfo_t *) info;
                      syscallarg(struct timespec *) timeout;
              } */
              struct proc *p = l->l_proc;
              int error, signum, timo;
              struct timespec ts, tsstart, tsnow;
              ksiginfo_t ksi;
      
              /*
               * Calculate timeout, if it was specified.
               *
               * NULL pointer means an infinite timeout.
               * {.tv_sec = 0, .tv_nsec = 0} means do not block.
               */
              if (SCARG(uap, timeout)) {
                      error = (*fetchts)(SCARG(uap, timeout), &ts, sizeof(ts));
                      if (error)
                              return error;
      
                      if ((error = itimespecfix(&ts)) != 0)
                              return error;
      
                      timo = tstohz(&ts);
                      if (timo == 0) {
                              if (ts.tv_sec == 0 && ts.tv_nsec == 0)
                                      timo = -1; /* do not block */
                              else
                                      timo = 1; /* the shortest possible timeout */
                      }
      
                      /*
		 * Remember the current uptime; it is used in the
		 * ECANCELED/ERESTART case.
                       */
                      getnanouptime(&tsstart);
              } else {
                      memset(&tsstart, 0, sizeof(tsstart)); /* XXXgcc */
                      timo = 0; /* infinite timeout */
              }
      
              error = (*fetchss)(SCARG(uap, set), &l->l_sigwaitset,
                  sizeof(l->l_sigwaitset));
              if (error)
                      return error;
      
              /*
	 * Silently ignore SA_CANTMASK signals.  psignal1() would ignore
	 * SA_CANTMASK signals in the wait set anyway; we strip them here
	 * only for the siglist check below.
               */
              sigminusset(&sigcantmask, &l->l_sigwaitset);
      
              memset(&ksi.ksi_info, 0, sizeof(ksi.ksi_info));
      
              mutex_enter(p->p_lock);
      
	/* Check for pending signals in the process; if none, then in the LWP. */
              if ((signum = sigget(&p->p_sigpend, &ksi, 0, &l->l_sigwaitset)) == 0)
                      signum = sigget(&l->l_sigpend, &ksi, 0, &l->l_sigwaitset);
      
              if (signum != 0) {
                      /* If found a pending signal, just copy it out to the user. */
                      mutex_exit(p->p_lock);
                      goto out;
              }
      
              if (timo < 0) {
                      /* If not allowed to block, return an error */
                      mutex_exit(p->p_lock);
                      return EAGAIN;
              }
      
              /*
	 * Set up the sigwait list and wait for a signal to arrive.
               * We can either be woken up or time out.
               */
              l->l_sigwaited = &ksi;
              LIST_INSERT_HEAD(&p->p_sigwaiters, l, l_sigwaiter);
              error = cv_timedwait_sig(&l->l_sigcv, p->p_lock, timo);
      
              /*
               * Need to find out if we woke as a result of _lwp_wakeup() or a
               * signal outside our wait set.
               */
              if (l->l_sigwaited != NULL) {
                      if (error == EINTR) {
                              /* Wakeup via _lwp_wakeup(). */
                              error = ECANCELED;
                      } else if (!error) {
                              /* Spurious wakeup - arrange for syscall restart. */
                              error = ERESTART;
                      }
                      l->l_sigwaited = NULL;
                      LIST_REMOVE(l, l_sigwaiter);
              }
              mutex_exit(p->p_lock);
      
              /*
	 * If the sleep was interrupted (either by a signal or a wakeup),
	 * update the timeout and copy the new value back out.  It will be
	 * used when the syscall is restarted or called again.
               */
              if (timo && (error == ERESTART || error == ECANCELED)) {
                      getnanouptime(&tsnow);
      
                      /* Compute how much time has passed since start. */
                      timespecsub(&tsnow, &tsstart, &tsnow);
      
		/* Subtract the elapsed time from the timeout. */
                      timespecsub(&ts, &tsnow, &ts);
      
                      if (ts.tv_sec < 0)
                              error = EAGAIN;
                      else {
                              /* Copy updated timeout to userland. */
                              error = (*storets)(&ts, SCARG(uap, timeout),
                                  sizeof(ts));
                      }
              }
      out:
              /*
               * If a signal from the wait set arrived, copy it to userland.
	 * Only the used part of siginfo is copied; the padding is left
	 * unchanged (userland is not supposed to touch it anyway).
               */
              if (error == 0 && SCARG(uap, info)) {
                      error = (*storeinf)(&ksi.ksi_info, SCARG(uap, info),
                          sizeof(ksi.ksi_info));
              }
              if (error == 0) {
                      *retval = ksi.ksi_info._signo;
                      SDT_PROBE(proc, kernel, , signal__clear, *retval,
                          &ksi, 0, 0, 0);
              }
              return error;
      }
      /*        $NetBSD: time.h,v 1.79 2017/01/17 15:28:34 maya Exp $        */
      
      /*
       * Copyright (c) 1982, 1986, 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)time.h        8.5 (Berkeley) 5/4/95
       */
      
      #ifndef _SYS_TIME_H_
      #define        _SYS_TIME_H_
      
      #include <sys/featuretest.h>
      #include <sys/types.h>
      
      /*
       * Structure returned by gettimeofday(2) system call,
       * and used in other calls.
       */
      struct timeval {
              time_t            tv_sec;                /* seconds */
              suseconds_t        tv_usec;        /* and microseconds */
      };
      
      #include <sys/timespec.h>
      
      #if defined(_NETBSD_SOURCE)
      #define        TIMEVAL_TO_TIMESPEC(tv, ts) do {                                \
              (ts)->tv_sec = (tv)->tv_sec;                                        \
              (ts)->tv_nsec = (tv)->tv_usec * 1000;                                \
      } while (/*CONSTCOND*/0)
      #define        TIMESPEC_TO_TIMEVAL(tv, ts) do {                                \
              (tv)->tv_sec = (ts)->tv_sec;                                        \
              (tv)->tv_usec = (suseconds_t)(ts)->tv_nsec / 1000;                \
      } while (/*CONSTCOND*/0)
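
/*
 * Example: converting between the two representations; note that
 * TIMESPEC_TO_TIMEVAL truncates nanoseconds down to microseconds.
 *
 *	struct timeval tv = { .tv_sec = 1, .tv_usec = 500000 };
 *	struct timespec ts;
 *
 *	TIMEVAL_TO_TIMESPEC(&tv, &ts);	// ts = { 1, 500000000 }
 *	TIMESPEC_TO_TIMEVAL(&tv, &ts);	// tv = { 1, 500000 } again
 */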
      
      /*
 * Note: timezone is obsolete.  All timezone handling is now done in
 * userland.  It's just here for backward compatibility.
       */
      struct timezone {
              int        tz_minuteswest;        /* minutes west of Greenwich */
              int        tz_dsttime;        /* type of dst correction */
      };
      
      /* Operations on timevals. */
      #define        timerclear(tvp)                (tvp)->tv_sec = (tvp)->tv_usec = 0L
      #define        timerisset(tvp)                ((tvp)->tv_sec || (tvp)->tv_usec)
      #define        timercmp(tvp, uvp, cmp)                                                \
              (((tvp)->tv_sec == (uvp)->tv_sec) ?                                \
                  ((tvp)->tv_usec cmp (uvp)->tv_usec) :                        \
                  ((tvp)->tv_sec cmp (uvp)->tv_sec))
      #define        timeradd(tvp, uvp, vvp)                                                \
              do {                                                                \
                      (vvp)->tv_sec = (tvp)->tv_sec + (uvp)->tv_sec;                \
                      (vvp)->tv_usec = (tvp)->tv_usec + (uvp)->tv_usec;        \
                      if ((vvp)->tv_usec >= 1000000) {                        \
                              (vvp)->tv_sec++;                                \
                              (vvp)->tv_usec -= 1000000;                        \
                      }                                                        \
              } while (/* CONSTCOND */ 0)
      #define        timersub(tvp, uvp, vvp)                                                \
              do {                                                                \
                      (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;                \
                      (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;        \
                      if ((vvp)->tv_usec < 0) {                                \
                              (vvp)->tv_sec--;                                \
                              (vvp)->tv_usec += 1000000;                        \
                      }                                                        \
              } while (/* CONSTCOND */ 0)
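
/*
 * Example: the three-argument forms write their result to the third
 * operand, carrying or borrowing microseconds as needed.
 *
 *	struct timeval a = { .tv_sec = 1, .tv_usec = 700000 };
 *	struct timeval b = { .tv_sec = 2, .tv_usec = 600000 };
 *	struct timeval r;
 *	int earlier;
 *
 *	timeradd(&a, &b, &r);		// r = { 4, 300000 }
 *	timersub(&b, &a, &r);		// r = { 0, 900000 }
 *	earlier = timercmp(&a, &b, <);	// 1: a precedes b
 */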
      
      /*
 * Hide bintime for _STANDALONE because this header is used for hpcboot.exe,
 * which is built with compilers that don't recognize the LL suffix.
       *        http://mail-index.NetBSD.org/tech-userlevel/2008/02/27/msg000181.html
       */
      #if !defined(_STANDALONE)
      struct bintime {
              time_t        sec;
              uint64_t frac;
      };
      
      static __inline void
      bintime_addx(struct bintime *bt, uint64_t x)
      {
              uint64_t u;
      
              u = bt->frac;
              bt->frac += x;
              if (u > bt->frac)
		bt->sec++;
      }
      
      static __inline void
      bintime_add(struct bintime *bt, const struct bintime *bt2)
      {
              uint64_t u;
      
	u = bt->frac;
	bt->frac += bt2->frac;
	if (u > bt->frac)
		bt->sec++;
	bt->sec += bt2->sec;
      }
      
      static __inline void
      bintime_sub(struct bintime *bt, const struct bintime *bt2)
      {
              uint64_t u;
      
              u = bt->frac;
              bt->frac -= bt2->frac;
	if (u < bt->frac)
		bt->sec--;
	bt->sec -= bt2->sec;
      }
      
      #define        bintimecmp(bta, btb, cmp)                                        \
              (((bta)->sec == (btb)->sec) ?                                        \
                  ((bta)->frac cmp (btb)->frac) :                                \
                  ((bta)->sec cmp (btb)->sec))
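
/*
 * Example: the fraction is a 64-bit binary fraction of a second, so 0.5s
 * is 1ULL << 63 and bintime_add() carries into .sec when the fraction
 * overflows.
 *
 *	struct bintime a = { .sec = 1, .frac = 1ULL << 63 };	// 1.5s
 *	struct bintime b = { .sec = 0, .frac = 3ULL << 62 };	// 0.75s
 *	int later;
 *
 *	bintime_add(&a, &b);		// a = 2.25s (fraction carried)
 *	bintime_sub(&a, &b);		// back to 1.5s
 *	later = bintimecmp(&a, &b, >);	// 1: a is later than b
 */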
      
      /*-
       * Background information:
       *
       * When converting between timestamps on parallel timescales of differing
       * resolutions it is historical and scientific practice to round down rather
       * than doing 4/5 rounding.
       *
       *   The date changes at midnight, not at noon.
       *
 *   Even at 15:59:59.999999999 it's not four o'clock.
 *
 *   time_second ticks after N.999999999, not after N.499999999.
       */
      
      /*
       * The magic numbers for converting ms/us/ns to fractions
       */
      
      /* 1ms = (2^64) / 1000       */
      #define        BINTIME_SCALE_MS        ((uint64_t)18446744073709551ULL)
      
      /* 1us = (2^64) / 1000000    */
      #define        BINTIME_SCALE_US        ((uint64_t)18446744073709ULL)
      
      /* 1ns = (2^64) / 1000000000 */
      #define        BINTIME_SCALE_NS        ((uint64_t)18446744073ULL)
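
/*
 * Worked example of the scale factors: 2^64 is 18446744073709551616, so
 * 2^64 / 1000000000 truncates to the BINTIME_SCALE_NS value above.
 * Multiplying a count of nanoseconds by it yields the corresponding
 * binary fraction, rounded down as described above:
 *
 *	uint64_t frac = 500000000ULL * BINTIME_SCALE_NS;
 *	// frac == 9223372036500000000, just under 1ULL << 63 (exactly 0.5s)
 */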
      
      static __inline void
      bintime2timespec(const struct bintime *bt, struct timespec *ts)
      {
      
              ts->tv_sec = bt->sec;
              ts->tv_nsec =
                  (long)((1000000000ULL * (uint32_t)(bt->frac >> 32)) >> 32);
      }
      
      static __inline void
      timespec2bintime(const struct timespec *ts, struct bintime *bt)
      {
      
              bt->sec = ts->tv_sec;
              bt->frac = (uint64_t)ts->tv_nsec * BINTIME_SCALE_NS;
      }
      
      static __inline void
      bintime2timeval(const struct bintime *bt, struct timeval *tv)
      {
      
	tv->tv_sec = bt->sec;
              tv->tv_usec =
                  (suseconds_t)((1000000ULL * (uint32_t)(bt->frac >> 32)) >> 32);
      }
      
      static __inline void
      timeval2bintime(const struct timeval *tv, struct bintime *bt)
      {
      
              bt->sec = tv->tv_sec;
              bt->frac = (uint64_t)tv->tv_usec * BINTIME_SCALE_US;
      }
      
      static __inline struct bintime
      ms2bintime(uint64_t ms)
      {
              struct bintime bt;
      
              bt.sec = (time_t)(ms / 1000U);
              bt.frac = (uint64_t)(ms % 1000U) * BINTIME_SCALE_MS;
      
              return bt;
      }
      
      static __inline struct bintime
      us2bintime(uint64_t us)
      {
              struct bintime bt;
      
              bt.sec = (time_t)(us / 1000000U);
              bt.frac = (uint64_t)(us % 1000000U) * BINTIME_SCALE_US;
      
              return bt;
      }
      
      static __inline struct bintime
      ns2bintime(uint64_t ns)
      {
              struct bintime bt;
      
              bt.sec = (time_t)(ns / 1000000000U);
              bt.frac = (uint64_t)(ns % 1000000000U) * BINTIME_SCALE_NS;
      
              return bt;
      }
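
/*
 * Example: round-tripping through bintime.  Both conversions round down,
 * so a nanosecond count can lose up to one nanosecond on the way back:
 *
 *	struct bintime bt = ns2bintime(1500000000ULL);	// 1.5s
 *	struct timespec ts;
 *
 *	bintime2timespec(&bt, &ts);
 *	// ts.tv_sec == 1, ts.tv_nsec == 499999999 (1ns lost to truncation)
 */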
      #endif /* !defined(_STANDALONE) */
      
      /* Operations on timespecs. */
      #define        timespecclear(tsp)        (tsp)->tv_sec = (time_t)((tsp)->tv_nsec = 0L)
      #define        timespecisset(tsp)        ((tsp)->tv_sec || (tsp)->tv_nsec)
      #define        timespeccmp(tsp, usp, cmp)                                        \
              (((tsp)->tv_sec == (usp)->tv_sec) ?                                \
                  ((tsp)->tv_nsec cmp (usp)->tv_nsec) :                        \
                  ((tsp)->tv_sec cmp (usp)->tv_sec))
      #define        timespecadd(tsp, usp, vsp)                                        \
              do {                                                                \
                      (vsp)->tv_sec = (tsp)->tv_sec + (usp)->tv_sec;                \
                      (vsp)->tv_nsec = (tsp)->tv_nsec + (usp)->tv_nsec;        \
                      if ((vsp)->tv_nsec >= 1000000000L) {                        \
                              (vsp)->tv_sec++;                                \
                              (vsp)->tv_nsec -= 1000000000L;                        \
                      }                                                        \
              } while (/* CONSTCOND */ 0)
      #define        timespecsub(tsp, usp, vsp)                                        \
              do {                                                                \
                      (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec;                \
                      (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec;        \
                      if ((vsp)->tv_nsec < 0) {                                \
                              (vsp)->tv_sec--;                                \
                              (vsp)->tv_nsec += 1000000000L;                        \
                      }                                                        \
              } while (/* CONSTCOND */ 0)
      #define timespec2ns(x) (((uint64_t)(x)->tv_sec) * 1000000000L + (x)->tv_nsec)
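
/*
 * Example: the timespec macros mirror the timeval ones above, with
 * nanosecond carry/borrow; timespec2ns() flattens a timespec into a
 * 64-bit nanosecond count.
 *
 *	struct timespec a = { .tv_sec = 1, .tv_nsec = 800000000 };
 *	struct timespec b = { .tv_sec = 0, .tv_nsec = 400000000 };
 *	struct timespec r;
 *	uint64_t ns;
 *
 *	timespecadd(&a, &b, &r);	// r = { 2, 200000000 }
 *	timespecsub(&a, &b, &r);	// r = { 1, 400000000 }
 *	ns = timespec2ns(&a);		// 1800000000
 */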
      #endif /* _NETBSD_SOURCE */
      
      /*
       * Names of the interval timers, and structure
       * defining a timer setting.
       * NB: Must match the CLOCK_ constants below.
       */
      #define        ITIMER_REAL                0
      #define        ITIMER_VIRTUAL                1
      #define        ITIMER_PROF                2
      #define        ITIMER_MONOTONIC        3
      
      struct        itimerval {
              struct        timeval it_interval;        /* timer interval */
              struct        timeval it_value;        /* current value */
      };
      
      /*
 * Structure defined by POSIX.1b to be like an itimerval, but with
       * timespecs. Used in the timer_*() system calls.
       */
      struct        itimerspec {
              struct        timespec it_interval;
              struct        timespec it_value;
      };
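
/*
 * Userland sketch: arming a repeating 250ms ITIMER_REAL timer with
 * setitimer(2).  it_value is the time to the first expiry, it_interval
 * the reload value; a zero it_interval makes the timer one-shot.  The
 * error handling shown is illustrative only.
 *
 *	struct itimerval it = {
 *		.it_value    = { .tv_sec = 0, .tv_usec = 250000 },
 *		.it_interval = { .tv_sec = 0, .tv_usec = 250000 },
 *	};
 *
 *	if (setitimer(ITIMER_REAL, &it, NULL) == -1)
 *		err(1, "setitimer");	// SIGALRM then fires every 250ms
 */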
      
      #define        CLOCK_REALTIME        0
      #define        CLOCK_VIRTUAL        1
      #define        CLOCK_PROF        2
      #define        CLOCK_MONOTONIC        3
      #define CLOCK_THREAD_CPUTIME_ID                0x20000000
      #define CLOCK_PROCESS_CPUTIME_ID        0x40000000
      
      #if defined(_NETBSD_SOURCE)
      #define        TIMER_RELTIME        0x0        /* relative timer */
      #endif
      #define        TIMER_ABSTIME        0x1        /* absolute timer */
      
      #ifdef _KERNEL
      #include <sys/timevar.h>
      #else /* !_KERNEL */
      #ifndef _STANDALONE
      #if (_POSIX_C_SOURCE - 0) >= 200112L || \
          (defined(_XOPEN_SOURCE) && defined(_XOPEN_SOURCE_EXTENDED)) || \
          (_XOPEN_SOURCE - 0) >= 500 || defined(_NETBSD_SOURCE)
      #include <sys/select.h>
      #endif
      
      #include <sys/cdefs.h>
      #include <time.h>
      
      __BEGIN_DECLS
      #ifndef __LIBC12_SOURCE__
      #if (_POSIX_C_SOURCE - 0) >= 200112L || \
          defined(_XOPEN_SOURCE) || defined(_NETBSD_SOURCE)
      int        getitimer(int, struct itimerval *) __RENAME(__getitimer50);
      int        gettimeofday(struct timeval * __restrict, void *__restrict)
          __RENAME(__gettimeofday50);
      int        setitimer(int, const struct itimerval * __restrict,
                  struct itimerval * __restrict) __RENAME(__setitimer50);
      int        utimes(const char *, const struct timeval [2]) __RENAME(__utimes50);
      #endif /* _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE || _NETBSD_SOURCE */
      
      #if defined(_NETBSD_SOURCE) || defined(HAVE_NBTOOL_CONFIG_H)
      int        adjtime(const struct timeval *, struct timeval *) __RENAME(__adjtime50);
      int        futimes(int, const struct timeval [2]) __RENAME(__futimes50);
      int        lutimes(const char *, const struct timeval [2]) __RENAME(__lutimes50);
      int        settimeofday(const struct timeval * __restrict,
                  const void *__restrict) __RENAME(__settimeofday50);
      #endif /* _NETBSD_SOURCE */
      #endif /* __LIBC12_SOURCE__ */
      __END_DECLS
      
      #endif        /* !_STANDALONE */
      #endif /* !_KERNEL */
      #endif /* !_SYS_TIME_H_ */
      /*        $NetBSD: subr_pool.c,v 1.259 2019/09/23 05:39:59 skrll Exp $        */
      
      /*
       * Copyright (c) 1997, 1999, 2000, 2002, 2007, 2008, 2010, 2014, 2015, 2018
       *     The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
       * Simulation Facility, NASA Ames Research Center; by Andrew Doran, and by
       * Maxime Villard.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.259 2019/09/23 05:39:59 skrll Exp $");
      
      #ifdef _KERNEL_OPT
      #include "opt_ddb.h"
      #include "opt_lockdebug.h"
      #include "opt_pool.h"
      #include "opt_kleak.h"
      #endif
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/sysctl.h>
      #include <sys/bitops.h>
      #include <sys/proc.h>
      #include <sys/errno.h>
      #include <sys/kernel.h>
      #include <sys/vmem.h>
      #include <sys/pool.h>
      #include <sys/syslog.h>
      #include <sys/debug.h>
      #include <sys/lockdebug.h>
      #include <sys/xcall.h>
      #include <sys/cpu.h>
      #include <sys/atomic.h>
      #include <sys/asan.h>
      
      #include <uvm/uvm_extern.h>
      
      /*
       * Pool resource management utility.
       *
       * Memory is allocated in pages which are split into pieces according to
       * the pool item size. Each page is kept on one of three lists in the
       * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
       * for empty, full and partially-full pages respectively. The individual
       * pool items are on a linked list headed by `ph_itemlist' in each page
       * header. The memory for building the page list is either taken from
       * the allocated pages themselves (for small pool items) or taken from
       * an internal pool of page headers (`phpool').
       */
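
      /*
       * Illustrative sketch (not part of this file): typical use of the pool
       * API implemented below.  The pool name "exmplpl" and the item type
       * "struct example_item" are hypothetical, for illustration only.
       *
       *        static struct pool example_pool;
       *
       *        // One-time setup: items of sizeof(struct example_item) bytes,
       *        // default alignment and the default (kmem) backend allocator.
       *        pool_init(&example_pool, sizeof(struct example_item), 0, 0, 0,
       *            "exmplpl", NULL, IPL_NONE);
       *
       *        // Allocate and later release a single item.
       *        struct example_item *it = pool_get(&example_pool, PR_WAITOK);
       *        ...
       *        pool_put(&example_pool, it);
       *
       *        // Tear down once no items remain outstanding.
       *        pool_destroy(&example_pool);
       */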
      
      /* List of all pools. Not static, as it is needed by 'vmstat -m'. */
      TAILQ_HEAD(, pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);
      
      /* Private pool for page header structures */
      #define        PHPOOL_MAX        8
      static struct pool phpool[PHPOOL_MAX];
      #define        PHPOOL_FREELIST_NELEM(idx) \
              (((idx) == 0) ? BITMAP_MIN_SIZE : BITMAP_SIZE * (1 << (idx)))
      
      #if defined(DIAGNOSTIC) || defined(KASAN)
      #define POOL_REDZONE
      #endif
      
      #ifdef POOL_REDZONE
      # ifdef KASAN
      #  define POOL_REDZONE_SIZE 8
      # else
      #  define POOL_REDZONE_SIZE 2
      # endif
      static void pool_redzone_init(struct pool *, size_t);
      static void pool_redzone_fill(struct pool *, void *);
      static void pool_redzone_check(struct pool *, void *);
      static void pool_cache_redzone_check(pool_cache_t, void *);
      #else
      # define pool_redzone_init(pp, sz)                __nothing
      # define pool_redzone_fill(pp, ptr)                __nothing
      # define pool_redzone_check(pp, ptr)                __nothing
      # define pool_cache_redzone_check(pc, ptr)        __nothing
      #endif
      
      #ifdef KLEAK
      static void pool_kleak_fill(struct pool *, void *);
      static void pool_cache_kleak_fill(pool_cache_t, void *);
      #else
      #define pool_kleak_fill(pp, ptr)        __nothing
      #define pool_cache_kleak_fill(pc, ptr)        __nothing
      #endif
      
      #ifdef POOL_QUARANTINE
      static void pool_quarantine_init(struct pool *);
      static void pool_quarantine_flush(struct pool *);
      static bool pool_put_quarantine(struct pool *, void *,
          struct pool_pagelist *);
      static bool pool_cache_put_quarantine(pool_cache_t, void *, paddr_t);
      #else
      #define pool_quarantine_init(a)                        __nothing
      #define pool_quarantine_flush(a)                __nothing
      #define pool_put_quarantine(a, b, c)                false
      #define pool_cache_put_quarantine(a, b, c)        false
      #endif
      
      #define pc_has_ctor(pc) \
              (pc->pc_ctor != (int (*)(void *, void *, int))nullop)
      #define pc_has_dtor(pc) \
              (pc->pc_dtor != (void (*)(void *, void *))nullop)
      
      /*
       * Pool backend allocators.
       *
       * Each pool has a backend allocator that handles allocation, deallocation,
       * and any additional draining that might be needed.
       *
       * We provide two standard allocators:
       *
       *        pool_allocator_kmem - the default when no allocator is specified
       *
       *        pool_allocator_nointr - used for pools that will not be accessed
       *        in interrupt context.
       */
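
      /*
       * Illustrative sketch (not part of this file): how a subsystem could
       * supply its own backend allocator instead of one of the standard ones
       * below.  The names "example_page_alloc", "example_page_free" and
       * "example_allocator" are hypothetical.
       *
       *        static void *
       *        example_page_alloc(struct pool *pp, int flags)
       *        {
       *                // Delegate to the standard page allocator; a real
       *                // backend could instead carve pages out of a private
       *                // arena.
       *                return pool_page_alloc(pp, flags);
       *        }
       *
       *        static void
       *        example_page_free(struct pool *pp, void *v)
       *        {
       *                pool_page_free(pp, v);
       *        }
       *
       *        static struct pool_allocator example_allocator = {
       *                .pa_alloc = example_page_alloc,
       *                .pa_free = example_page_free,
       *                .pa_pagesz = 0,        // 0 selects PAGE_SIZE in pool_init()
       *        };
       *
       *        pool_init(&some_pool, size, 0, 0, 0, "somepl",
       *            &example_allocator, IPL_VM);
       */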
      void *pool_page_alloc(struct pool *, int);
      void pool_page_free(struct pool *, void *);
      
      static void *pool_page_alloc_meta(struct pool *, int);
      static void pool_page_free_meta(struct pool *, void *);
      
      struct pool_allocator pool_allocator_kmem = {
              .pa_alloc = pool_page_alloc,
              .pa_free = pool_page_free,
              .pa_pagesz = 0
      };
      
      struct pool_allocator pool_allocator_nointr = {
              .pa_alloc = pool_page_alloc,
              .pa_free = pool_page_free,
              .pa_pagesz = 0
      };
      
      struct pool_allocator pool_allocator_meta = {
              .pa_alloc = pool_page_alloc_meta,
              .pa_free = pool_page_free_meta,
              .pa_pagesz = 0
      };
      
      #define POOL_ALLOCATOR_BIG_BASE 13
      static struct pool_allocator pool_allocator_big[] = {
              {
                      .pa_alloc = pool_page_alloc,
                      .pa_free = pool_page_free,
                      .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 0),
              },
              {
                      .pa_alloc = pool_page_alloc,
                      .pa_free = pool_page_free,
                      .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 1),
              },
              {
                      .pa_alloc = pool_page_alloc,
                      .pa_free = pool_page_free,
                      .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 2),
              },
              {
                      .pa_alloc = pool_page_alloc,
                      .pa_free = pool_page_free,
                      .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 3),
              },
              {
                      .pa_alloc = pool_page_alloc,
                      .pa_free = pool_page_free,
                      .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 4),
              },
              {
                      .pa_alloc = pool_page_alloc,
                      .pa_free = pool_page_free,
                      .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 5),
              },
              {
                      .pa_alloc = pool_page_alloc,
                      .pa_free = pool_page_free,
                      .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 6),
              },
              {
                      .pa_alloc = pool_page_alloc,
                      .pa_free = pool_page_free,
                      .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 7),
              }
      };
      
      static int pool_bigidx(size_t);
      
      /* # of seconds to retain page after last use */
      int pool_inactive_time = 10;
      
      /* Next candidate for drainage (see pool_drain()) */
      static struct pool *drainpp;
      
      /* This lock protects both pool_head and drainpp. */
      static kmutex_t pool_head_lock;
      static kcondvar_t pool_busy;
      
      /* This lock protects initialization of a potentially shared pool allocator */
      static kmutex_t pool_allocator_lock;
      
      static unsigned int poolid_counter = 0;
      
      typedef uint32_t pool_item_bitmap_t;
      #define        BITMAP_SIZE        (CHAR_BIT * sizeof(pool_item_bitmap_t))
      #define        BITMAP_MASK        (BITMAP_SIZE - 1)
      #define        BITMAP_MIN_SIZE        (CHAR_BIT * sizeof(((struct pool_item_header *)NULL)->ph_u2))
      
      struct pool_item_header {
              /* Page headers */
              LIST_ENTRY(pool_item_header)
                                      ph_pagelist;        /* pool page list */
              union {
                      /* !PR_PHINPAGE */
                      struct {
                              SPLAY_ENTRY(pool_item_header)
                                      phu_node;        /* off-page page headers */
                      } phu_offpage;
                      /* PR_PHINPAGE */
                      struct {
                              unsigned int phu_poolid;
                      } phu_onpage;
              } ph_u1;
              void *                        ph_page;        /* this page's address */
              uint32_t                ph_time;        /* last referenced */
              uint16_t                ph_nmissing;        /* # of chunks in use */
              uint16_t                ph_off;                /* start offset in page */
              union {
                      /* !PR_USEBMAP */
                      struct {
                              LIST_HEAD(, pool_item)
                                      phu_itemlist;        /* chunk list for this page */
                      } phu_normal;
                      /* PR_USEBMAP */
                      struct {
                              pool_item_bitmap_t phu_bitmap[1];
                      } phu_notouch;
              } ph_u2;
      };
      #define ph_node                ph_u1.phu_offpage.phu_node
      #define ph_poolid        ph_u1.phu_onpage.phu_poolid
      #define ph_itemlist        ph_u2.phu_normal.phu_itemlist
      #define ph_bitmap        ph_u2.phu_notouch.phu_bitmap
      
      #define PHSIZE        ALIGN(sizeof(struct pool_item_header))
      
      CTASSERT(offsetof(struct pool_item_header, ph_u2) +
          BITMAP_MIN_SIZE / CHAR_BIT == sizeof(struct pool_item_header));
      
      #if defined(DIAGNOSTIC) && !defined(KASAN)
      #define POOL_CHECK_MAGIC
      #endif
      
      struct pool_item {
      #ifdef POOL_CHECK_MAGIC
              u_int pi_magic;
      #endif
      #define        PI_MAGIC 0xdeaddeadU
              /* Other entries use only this list entry */
              LIST_ENTRY(pool_item)        pi_list;
      };
      
      #define        POOL_NEEDS_CATCHUP(pp)                                                \
              ((pp)->pr_nitems < (pp)->pr_minitems)
      #define        POOL_OBJ_TO_PAGE(pp, v)                                                \
              (void *)((uintptr_t)(v) & (pp)->pr_alloc->pa_pagemask)
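
      /*
       * Example (hypothetical address): with pa_pagesz = 4096, pa_pagemask is
       * ~0xfff, so an object at 0xffff800012345678 maps to the page starting
       * at 0xffff800012345000.
       */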
      
      /*
       * Pool cache management.
       *
       * Pool caches provide a way for constructed objects to be cached by the
        * pool subsystem.  This can improve performance by deferring needless
        * object construction/destruction until it is absolutely necessary.
       *
       * Caches are grouped into cache groups.  Each cache group references up
       * to PCG_NUMOBJECTS constructed objects.  When a cache allocates an
       * object from the pool, it calls the object's constructor and places it
       * into a cache group.  When a cache group frees an object back to the
       * pool, it first calls the object's destructor.  This allows the object
       * to persist in constructed form while freed to the cache.
       *
       * The pool references each cache, so that when a pool is drained by the
       * pagedaemon, it can drain each individual cache as well.  Each time a
       * cache is drained, the most idle cache group is freed to the pool in
       * its entirety.
       *
        * Pool caches are layered on top of pools.  By layering them, we can avoid
       * the complexity of cache management for pools which would not benefit
       * from it.
       */
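
      /*
       * Illustrative sketch (not part of this file): typical use of the
       * pool_cache(9) interface built on the code below; see pool_cache(9)
       * for the authoritative prototypes.  The cache name "exmplcache" and
       * the constructor/destructor pair are hypothetical.
       *
       *        static pool_cache_t example_cache;
       *
       *        // The ctor/dtor run only when objects move between the cache
       *        // and the backing pool, not on every get/put.
       *        example_cache = pool_cache_init(sizeof(struct example_obj),
       *            coherency_unit, 0, 0, "exmplcache", NULL, IPL_NONE,
       *            example_ctor, example_dtor, NULL);
       *
       *        struct example_obj *o = pool_cache_get(example_cache, PR_WAITOK);
       *        ...
       *        pool_cache_put(example_cache, o);
       *
       *        pool_cache_destroy(example_cache);
       */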
      
      static struct pool pcg_normal_pool;
      static struct pool pcg_large_pool;
      static struct pool cache_pool;
      static struct pool cache_cpu_pool;
      
      /* List of all caches. */
      TAILQ_HEAD(,pool_cache) pool_cache_head =
          TAILQ_HEAD_INITIALIZER(pool_cache_head);
      
      int pool_cache_disable;                /* global disable for caching */
      static const pcg_t pcg_dummy;        /* zero sized: always empty, yet always full */
      
      static bool        pool_cache_put_slow(pool_cache_cpu_t *, int,
                                          void *);
      static bool        pool_cache_get_slow(pool_cache_cpu_t *, int,
                                          void **, paddr_t *, int);
      static void        pool_cache_cpu_init1(struct cpu_info *, pool_cache_t);
      static void        pool_cache_invalidate_groups(pool_cache_t, pcg_t *);
      static void        pool_cache_invalidate_cpu(pool_cache_t, u_int);
      static void        pool_cache_transfer(pool_cache_t);
      
      static int        pool_catchup(struct pool *);
      static void        pool_prime_page(struct pool *, void *,
                          struct pool_item_header *);
      static void        pool_update_curpage(struct pool *);
      
      static int        pool_grow(struct pool *, int);
      static void        *pool_allocator_alloc(struct pool *, int);
      static void        pool_allocator_free(struct pool *, void *);
      
      static void pool_print_pagelist(struct pool *, struct pool_pagelist *,
              void (*)(const char *, ...) __printflike(1, 2));
      static void pool_print1(struct pool *, const char *,
              void (*)(const char *, ...) __printflike(1, 2));
      
      static int pool_chk_page(struct pool *, const char *,
                               struct pool_item_header *);
      
      /* -------------------------------------------------------------------------- */
      
      static inline unsigned int
      pr_item_bitmap_index(const struct pool *pp, const struct pool_item_header *ph,
          const void *v)
      {
              const char *cp = v;
              unsigned int idx;
      
              KASSERT(pp->pr_roflags & PR_USEBMAP);
              idx = (cp - (char *)ph->ph_page - ph->ph_off) / pp->pr_size;
      
              if (__predict_false(idx >= pp->pr_itemsperpage)) {
                      panic("%s: [%s] %u >= %u", __func__, pp->pr_wchan, idx,
                          pp->pr_itemsperpage);
              }
      
              return idx;
      }
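
      /*
       * Worked example (hypothetical numbers): with ph_off = 128 and
       * pr_size = 64, an object at ph_page + 128 + 3 * 64 yields idx = 3,
       * i.e. bit 3 of ph_bitmap[0] in the put/get routines below.
       */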
      
      static inline void
      pr_item_bitmap_put(const struct pool *pp, struct pool_item_header *ph,
          void *obj)
      {
              unsigned int idx = pr_item_bitmap_index(pp, ph, obj);
              pool_item_bitmap_t *bitmap = ph->ph_bitmap + (idx / BITMAP_SIZE);
              pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK);

              if (__predict_false((*bitmap & mask) != 0)) {
                      panic("%s: [%s] %p already freed", __func__, pp->pr_wchan, obj);
              }

              *bitmap |= mask;
      }
      
      static inline void *
      pr_item_bitmap_get(const struct pool *pp, struct pool_item_header *ph)
      {
              pool_item_bitmap_t *bitmap = ph->ph_bitmap;
              unsigned int idx;
              int i;

              for (i = 0; ; i++) {
                      int bit;

                      KASSERT((i * BITMAP_SIZE) < pp->pr_itemsperpage);
                      bit = ffs32(bitmap[i]);
                      if (bit) {
                              pool_item_bitmap_t mask;

                              bit--;
                              idx = (i * BITMAP_SIZE) + bit;
                              mask = 1U << bit;
                              KASSERT((bitmap[i] & mask) != 0);
                              bitmap[i] &= ~mask;
                              break;
                      }
              }
              KASSERT(idx < pp->pr_itemsperpage);
              return (char *)ph->ph_page + ph->ph_off + idx * pp->pr_size;
      }
      
      static inline void
      pr_item_bitmap_init(const struct pool *pp, struct pool_item_header *ph)
      {
              pool_item_bitmap_t *bitmap = ph->ph_bitmap;
              const int n = howmany(pp->pr_itemsperpage, BITMAP_SIZE);
              int i;

              for (i = 0; i < n; i++) {
                      bitmap[i] = (pool_item_bitmap_t)-1;
              }
      }
      
      /* -------------------------------------------------------------------------- */
      
      static inline void
      pr_item_linkedlist_put(const struct pool *pp, struct pool_item_header *ph,
          void *obj)
      {
              struct pool_item *pi = obj;
      
      #ifdef POOL_CHECK_MAGIC
              pi->pi_magic = PI_MAGIC;
      #endif
      
              if (pp->pr_redzone) {
                      /*
                       * Mark the pool_item as valid. The rest is already
                       * invalid.
                       */
                      kasan_mark(pi, sizeof(*pi), sizeof(*pi), 0);
              }

              LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
      }
      
      static inline void *
      pr_item_linkedlist_get(struct pool *pp, struct pool_item_header *ph)
      {
              struct pool_item *pi;
              void *v;
      
              v = pi = LIST_FIRST(&ph->ph_itemlist);
              if (__predict_false(v == NULL)) {
                      mutex_exit(&pp->pr_lock);
                      panic("%s: [%s] page empty", __func__, pp->pr_wchan);
              }
              KASSERTMSG((pp->pr_nitems > 0),
                  "%s: [%s] nitems %u inconsistent on itemlist",
                  __func__, pp->pr_wchan, pp->pr_nitems);
      #ifdef POOL_CHECK_MAGIC
              KASSERTMSG((pi->pi_magic == PI_MAGIC),
                  "%s: [%s] free list modified: "
                  "magic=%x; page %p; item addr %p", __func__,
                  pp->pr_wchan, pi->pi_magic, ph->ph_page, pi);
      #endif
      
              /*
               * Remove from item list.
               */
              LIST_REMOVE(pi, pi_list);
      
              return v;
      }
      
      /* -------------------------------------------------------------------------- */
      
      static inline void
      pr_phinpage_check(struct pool *pp, struct pool_item_header *ph, void *page,
          void *object)
      {
              if (__predict_false((void *)ph->ph_page != page)) {
                      panic("%s: [%s] item %p not part of pool", __func__,
                          pp->pr_wchan, object);
              }
              if (__predict_false((char *)object < (char *)page + ph->ph_off)) {
                      panic("%s: [%s] item %p below item space", __func__,
                          pp->pr_wchan, object);
              }
              if (__predict_false(ph->ph_poolid != pp->pr_poolid)) {
                      panic("%s: [%s] item %p poolid %u != %u", __func__,
                          pp->pr_wchan, object, ph->ph_poolid, pp->pr_poolid);
              }
      }
      
      static inline void
      pc_phinpage_check(pool_cache_t pc, void *object)
      {
              struct pool_item_header *ph;
              struct pool *pp;
              void *page;
      
              pp = &pc->pc_pool;
              page = POOL_OBJ_TO_PAGE(pp, object);
              ph = (struct pool_item_header *)page;

              pr_phinpage_check(pp, ph, page, object);
      }
      
      /* -------------------------------------------------------------------------- */
      
      static inline int
      phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
      {
      
              /*
               * We consider a pool_item_header with a smaller ph_page to be
               * bigger.  This unnatural ordering is for the benefit of
               * pr_find_pagehead.
               */
              if (a->ph_page < b->ph_page)
                      return 1;
              else if (a->ph_page > b->ph_page)
                      return -1;
              else
                      return 0;
      }

      SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
      SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
      
      static inline struct pool_item_header *
      pr_find_pagehead_noalign(struct pool *pp, void *v)
      {
              struct pool_item_header *ph, tmp;

              tmp.ph_page = (void *)(uintptr_t)v;
              ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
              if (ph == NULL) {
                      ph = SPLAY_ROOT(&pp->pr_phtree);
                      if (ph != NULL && phtree_compare(&tmp, ph) >= 0) {
                              ph = SPLAY_NEXT(phtree, &pp->pr_phtree, ph);
                      }
                      KASSERT(ph == NULL || phtree_compare(&tmp, ph) < 0);
              }
      
              return ph;
      }
      
      /*
       * Return the pool page header based on item address.
       */
      static inline struct pool_item_header *
      pr_find_pagehead(struct pool *pp, void *v)
      {
              struct pool_item_header *ph, tmp;

              if ((pp->pr_roflags & PR_NOALIGN) != 0) {
                      ph = pr_find_pagehead_noalign(pp, v);
              } else {
                      void *page = POOL_OBJ_TO_PAGE(pp, v);
                      if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
                              ph = (struct pool_item_header *)page;
                              pr_phinpage_check(pp, ph, page, v);
                      } else {
                              tmp.ph_page = page;
                              ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
                      }
              }

              KASSERT(ph == NULL || ((pp->pr_roflags & PR_PHINPAGE) != 0) ||
                  ((char *)ph->ph_page <= (char *)v &&
                  (char *)v < (char *)ph->ph_page + pp->pr_alloc->pa_pagesz));
              return ph;
      }
      
      static void
      pr_pagelist_free(struct pool *pp, struct pool_pagelist *pq)
      {
              struct pool_item_header *ph;
      
              while ((ph = LIST_FIRST(pq)) != NULL) {
                      LIST_REMOVE(ph, ph_pagelist);
                      pool_allocator_free(pp, ph->ph_page);
                      if ((pp->pr_roflags & PR_PHINPAGE) == 0)
                              pool_put(pp->pr_phpool, ph);
              }
      }
      
      /*
       * Remove a page from the pool.
       */
      static inline void
      pr_rmpage(struct pool *pp, struct pool_item_header *ph,
           struct pool_pagelist *pq)
      {
      
              KASSERT(mutex_owned(&pp->pr_lock));
      
              /*
               * If the page was idle, decrement the idle page count.
               */
              if (ph->ph_nmissing == 0) {
                      KASSERT(pp->pr_nidle != 0);
                      KASSERTMSG((pp->pr_nitems >= pp->pr_itemsperpage),
                          "%s: [%s] nitems=%u < itemsperpage=%u", __func__,
                          pp->pr_wchan, pp->pr_nitems, pp->pr_itemsperpage);
                      pp->pr_nidle--;
              }
      
              pp->pr_nitems -= pp->pr_itemsperpage;
      
              /*
               * Unlink the page from the pool and queue it for release.
               */
              LIST_REMOVE(ph, ph_pagelist);
              if (pp->pr_roflags & PR_PHINPAGE) {
                      if (__predict_false(ph->ph_poolid != pp->pr_poolid)) {
                              panic("%s: [%s] ph %p poolid %u != %u",
                                  __func__, pp->pr_wchan, ph, ph->ph_poolid,
                                  pp->pr_poolid);
                      }
              } else {
                      SPLAY_REMOVE(phtree, &pp->pr_phtree, ph);
              }
              LIST_INSERT_HEAD(pq, ph, ph_pagelist);
      
              pp->pr_npages--;
              pp->pr_npagefree++;
      
              pool_update_curpage(pp);
      }
      
      /*
       * Initialize all the pools listed in the "pools" link set.
       */
      void
      pool_subsystem_init(void)
      {
              size_t size;
              int idx;
      
              mutex_init(&pool_head_lock, MUTEX_DEFAULT, IPL_NONE);
              mutex_init(&pool_allocator_lock, MUTEX_DEFAULT, IPL_NONE);
              cv_init(&pool_busy, "poolbusy");
      
              /*
               * Initialize private page header pool and cache magazine pool if we
               * haven't done so yet.
               */
              for (idx = 0; idx < PHPOOL_MAX; idx++) {
                      static char phpool_names[PHPOOL_MAX][6+1+6+1];
                      int nelem;
                      size_t sz;
      
                      nelem = PHPOOL_FREELIST_NELEM(idx);
                      KASSERT(nelem != 0);
                      snprintf(phpool_names[idx], sizeof(phpool_names[idx]),
                          "phpool-%d", nelem);
                      sz = offsetof(struct pool_item_header,
                          ph_bitmap[howmany(nelem, BITMAP_SIZE)]);
                      pool_init(&phpool[idx], sz, 0, 0, 0,
                          phpool_names[idx], &pool_allocator_meta, IPL_VM);
              }
      
              size = sizeof(pcg_t) +
                  (PCG_NOBJECTS_NORMAL - 1) * sizeof(pcgpair_t);
              pool_init(&pcg_normal_pool, size, coherency_unit, 0, 0,
                  "pcgnormal", &pool_allocator_meta, IPL_VM);
      
              size = sizeof(pcg_t) +
                  (PCG_NOBJECTS_LARGE - 1) * sizeof(pcgpair_t);
              pool_init(&pcg_large_pool, size, coherency_unit, 0, 0,
                  "pcglarge", &pool_allocator_meta, IPL_VM);
      
              pool_init(&cache_pool, sizeof(struct pool_cache), coherency_unit,
                  0, 0, "pcache", &pool_allocator_meta, IPL_NONE);
      
              pool_init(&cache_cpu_pool, sizeof(pool_cache_cpu_t), coherency_unit,
                  0, 0, "pcachecpu", &pool_allocator_meta, IPL_NONE);
      }
      
      static inline bool
      pool_init_is_phinpage(const struct pool *pp)
      {
              size_t pagesize;
      
              if (pp->pr_roflags & PR_PHINPAGE) {
                      return true;
              }
              if (pp->pr_roflags & (PR_NOTOUCH | PR_NOALIGN)) {
                      return false;
              }
      
              pagesize = pp->pr_alloc->pa_pagesz;
      
              /*
               * Threshold: the item size is below 1/16 of the page size, and below
               * 8 times the page header size. The latter ensures we go off-page
               * if the page header would make us waste a rather big item.
               */
              if (pp->pr_size < MIN(pagesize / 16, PHSIZE * 8)) {
                      return true;
              }
      
              /* Put the header into the page if it doesn't waste any items. */
              if (pagesize / pp->pr_size == (pagesize - PHSIZE) / pp->pr_size) {
                      return true;
              }
      
              return false;
      }
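
      /*
       * Worked example (hypothetical LP64 layout, PHSIZE = 56, 4096-byte
       * pages): items smaller than MIN(4096 / 16, 56 * 8) = 256 bytes keep
       * the header on-page.  A 1000-byte item also stays on-page, because
       * 4096 / 1000 == (4096 - 56) / 1000 == 4: the header occupies slack
       * that could not hold another item anyway.
       */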
      
      static inline bool
      pool_init_is_usebmap(const struct pool *pp)
      {
              size_t bmapsize;
      
              if (pp->pr_roflags & PR_NOTOUCH) {
                      return true;
              }
      
              /*
               * If we're off-page, go with a bitmap.
               */
              if (!(pp->pr_roflags & PR_PHINPAGE)) {
                      return true;
              }
      
              /*
               * If we're on-page, and the page header can already contain a bitmap
               * big enough to cover all the items of the page, go with a bitmap.
               */
              bmapsize = roundup(PHSIZE, pp->pr_align) -
                  offsetof(struct pool_item_header, ph_bitmap[0]);
              KASSERT(bmapsize % sizeof(pool_item_bitmap_t) == 0);
              if (pp->pr_itemsperpage <= bmapsize * CHAR_BIT) {
                      return true;
              }
      
              return false;
      }
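
      /*
       * Worked example (hypothetical LP64 layout): with PHSIZE = 56,
       * pr_align = 8 and ph_bitmap at offset 48, an on-page header has
       * roundup(56, 8) - 48 = 8 bytes (64 bits) of built-in bitmap space,
       * so pages holding up to 64 items can use PR_USEBMAP at no extra cost.
       */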
      
      /*
       * Initialize the given pool resource structure.
       *
       * We export this routine to allow other kernel parts to declare
       * static pools that must be initialized before kmem(9) is available.
       */
      void
      pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
          const char *wchan, struct pool_allocator *palloc, int ipl)
      {
              struct pool *pp1;
              size_t prsize;
              int itemspace, slack;
      
              /* XXX ioff will be removed. */
              KASSERT(ioff == 0);
      
      #ifdef DEBUG
              if (__predict_true(!cold))
                      mutex_enter(&pool_head_lock);
              /*
               * Check that the pool hasn't already been initialised and
               * added to the list of all pools.
               */
              TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
                      if (pp == pp1)
                              panic("%s: [%s] already initialised", __func__,
                                  wchan);
              }
              if (__predict_true(!cold))
                      mutex_exit(&pool_head_lock);
      #endif
      
              if (palloc == NULL)
                      palloc = &pool_allocator_kmem;
      
              if (!cold)
                      mutex_enter(&pool_allocator_lock);
              if (palloc->pa_refcnt++ == 0) {
                      if (palloc->pa_pagesz == 0)
                              palloc->pa_pagesz = PAGE_SIZE;
      
                      TAILQ_INIT(&palloc->pa_list);
      
                      mutex_init(&palloc->pa_lock, MUTEX_DEFAULT, IPL_VM);
                      palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
                      palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
              }
              if (!cold)
                      mutex_exit(&pool_allocator_lock);
      
              if (align == 0)
                      align = ALIGN(1);
      
              prsize = size;
              if ((flags & PR_NOTOUCH) == 0 && prsize < sizeof(struct pool_item))
                      prsize = sizeof(struct pool_item);
      
              prsize = roundup(prsize, align);
              KASSERTMSG((prsize <= palloc->pa_pagesz),
                  "%s: [%s] pool item size (%zu) larger than page size (%u)",
                  __func__, wchan, prsize, palloc->pa_pagesz);
      
              /*
               * Initialize the pool structure.
               */
              LIST_INIT(&pp->pr_emptypages);
              LIST_INIT(&pp->pr_fullpages);
              LIST_INIT(&pp->pr_partpages);
              pp->pr_cache = NULL;
              pp->pr_curpage = NULL;
              pp->pr_npages = 0;
              pp->pr_minitems = 0;
              pp->pr_minpages = 0;
              pp->pr_maxpages = UINT_MAX;
              pp->pr_roflags = flags;
              pp->pr_flags = 0;
              pp->pr_size = prsize;
              pp->pr_reqsize = size;
              pp->pr_align = align;
              pp->pr_wchan = wchan;
              pp->pr_alloc = palloc;
              pp->pr_poolid = atomic_inc_uint_nv(&poolid_counter);
              pp->pr_nitems = 0;
              pp->pr_nout = 0;
              pp->pr_hardlimit = UINT_MAX;
              pp->pr_hardlimit_warning = NULL;
              pp->pr_hardlimit_ratecap.tv_sec = 0;
              pp->pr_hardlimit_ratecap.tv_usec = 0;
              pp->pr_hardlimit_warning_last.tv_sec = 0;
              pp->pr_hardlimit_warning_last.tv_usec = 0;
              pp->pr_drain_hook = NULL;
              pp->pr_drain_hook_arg = NULL;
              pp->pr_freecheck = NULL;
              pp->pr_redzone = false;
              pool_redzone_init(pp, size);
              pool_quarantine_init(pp);
      
              /*
               * Decide whether to put the page header off-page to avoid wasting too
               * large a part of the page or too big an item. Off-page page headers
               * go into a splay tree, so we can match a returned item with its header
               * based on the page address.
               */
              if (pool_init_is_phinpage(pp)) {
                      /* Use the beginning of the page for the page header */
                      itemspace = palloc->pa_pagesz - roundup(PHSIZE, align);
                      pp->pr_itemoffset = roundup(PHSIZE, align);
                      pp->pr_roflags |= PR_PHINPAGE;
              } else {
                      /* The page header will be taken from our page header pool */
                      itemspace = palloc->pa_pagesz;
                      pp->pr_itemoffset = 0;
                      SPLAY_INIT(&pp->pr_phtree);
              }
      
              pp->pr_itemsperpage = itemspace / pp->pr_size;
              KASSERT(pp->pr_itemsperpage != 0);
      
              /*
               * Decide whether to use a bitmap or a linked list to manage freed
               * items.
               */
              if (pool_init_is_usebmap(pp)) {
                      pp->pr_roflags |= PR_USEBMAP;
              }
      
              /*
               * If we're off-page, then we're using a bitmap; choose the appropriate
               * pool to allocate page headers, whose size varies depending on the
               * bitmap. If we're on-page, nothing to do.
               */
              if (!(pp->pr_roflags & PR_PHINPAGE)) {
                      int idx;
      
                      KASSERT(pp->pr_roflags & PR_USEBMAP);
      
                      for (idx = 0; pp->pr_itemsperpage > PHPOOL_FREELIST_NELEM(idx);
                          idx++) {
                              /* nothing */
                      }
                      if (idx >= PHPOOL_MAX) {
                              /*
                               * If you see this panic, consider tweaking
                               * PHPOOL_MAX and PHPOOL_FREELIST_NELEM.
                               */
                              panic("%s: [%s] too large itemsperpage(%d) for "
                                  "PR_USEBMAP", __func__,
                                  pp->pr_wchan, pp->pr_itemsperpage);
                      }
                      pp->pr_phpool = &phpool[idx];
              } else {
                      pp->pr_phpool = NULL;
              }
      
              /*
               * Use the slack between the chunks and the page header
               * for "cache coloring".
               */
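              /*
               * Example (hypothetical numbers): if itemspace were 4096 with
               * pr_size = 600 and align = 8, six items would fit, leaving
               * slack = 4096 - 6 * 600 = 496 and pr_maxcolor = 496; each new
               * page then starts its item area at offsets 0, 8, 16, ... up
               * to 496 before wrapping, spreading items across cache lines.
               */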
              slack = itemspace - pp->pr_itemsperpage * pp->pr_size;
              pp->pr_maxcolor = rounddown(slack, align);
              pp->pr_curcolor = 0;
      
              pp->pr_nget = 0;
              pp->pr_nfail = 0;
              pp->pr_nput = 0;
              pp->pr_npagealloc = 0;
              pp->pr_npagefree = 0;
              pp->pr_hiwat = 0;
              pp->pr_nidle = 0;
              pp->pr_refcnt = 0;
      
              mutex_init(&pp->pr_lock, MUTEX_DEFAULT, ipl);
              cv_init(&pp->pr_cv, wchan);
              pp->pr_ipl = ipl;
      
              /* Insert into the list of all pools. */
              if (!cold)
                      mutex_enter(&pool_head_lock);
              TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
                      if (strcmp(pp1->pr_wchan, pp->pr_wchan) > 0)
                              break;
              }
              if (pp1 == NULL)
                      TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
              else
                      TAILQ_INSERT_BEFORE(pp1, pp, pr_poollist);
              if (!cold)
                      mutex_exit(&pool_head_lock);
      
              /* Insert this into the list of pools using this allocator. */
              if (!cold)
                      mutex_enter(&palloc->pa_lock);
              TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list);
              if (!cold)
                      mutex_exit(&palloc->pa_lock);
      }
      
      /*
        * De-commission a pool resource.
       */
      void
      pool_destroy(struct pool *pp)
      {
              struct pool_pagelist pq;
              struct pool_item_header *ph;
      
              pool_quarantine_flush(pp);
      
              /* Remove from global pool list */
              mutex_enter(&pool_head_lock);
              while (pp->pr_refcnt != 0)
                      cv_wait(&pool_busy, &pool_head_lock);
              TAILQ_REMOVE(&pool_head, pp, pr_poollist);
              if (drainpp == pp)
                      drainpp = NULL;
              mutex_exit(&pool_head_lock);
      
              /* Remove this pool from its allocator's list of pools. */
              mutex_enter(&pp->pr_alloc->pa_lock);
              TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list);
              mutex_exit(&pp->pr_alloc->pa_lock);
      
              mutex_enter(&pool_allocator_lock);
              if (--pp->pr_alloc->pa_refcnt == 0)
                      mutex_destroy(&pp->pr_alloc->pa_lock);
              mutex_exit(&pool_allocator_lock);
      
              mutex_enter(&pp->pr_lock);
      
              KASSERT(pp->pr_cache == NULL);
              KASSERTMSG((pp->pr_nout == 0),
                  "%s: [%s] pool busy: still out: %u", __func__, pp->pr_wchan,
                  pp->pr_nout);
              KASSERT(LIST_EMPTY(&pp->pr_fullpages));
              KASSERT(LIST_EMPTY(&pp->pr_partpages));
      
              /* Remove all pages */
              LIST_INIT(&pq);
              while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
                      pr_rmpage(pp, ph, &pq);
      
              mutex_exit(&pp->pr_lock);
      
              pr_pagelist_free(pp, &pq);
              cv_destroy(&pp->pr_cv);
              mutex_destroy(&pp->pr_lock);
      }
      
      void
      pool_set_drain_hook(struct pool *pp, void (*fn)(void *, int), void *arg)
      {
      
              /* XXX no locking -- must be used just after pool_init() */
              KASSERTMSG((pp->pr_drain_hook == NULL),
                  "%s: [%s] already set", __func__, pp->pr_wchan);
              pp->pr_drain_hook = fn;
              pp->pr_drain_hook_arg = arg;
      }
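
      /*
       * Illustrative sketch (hypothetical callback): a subsystem that caches
       * pool-backed objects can register a drain hook right after pool_init()
       * so that the pool can ask it to release items under memory pressure:
       *
       *        static void
       *        example_drain(void *arg, int flags)
       *        {
       *                // Release privately cached items back to the pool here.
       *        }
       *
       *        pool_set_drain_hook(&example_pool, example_drain, NULL);
       */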
      
      static struct pool_item_header *
      pool_alloc_item_header(struct pool *pp, void *storage, int flags)
      {
              struct pool_item_header *ph;
      
              if ((pp->pr_roflags & PR_PHINPAGE) != 0)
                      ph = storage;
              else
                      ph = pool_get(pp->pr_phpool, flags);
      
              return ph;
      }
      
      /*
       * Grab an item from the pool.
       */
      void *
      pool_get(struct pool *pp, int flags)
      {
              struct pool_item_header *ph;
              void *v;
      
              KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));
              KASSERTMSG((pp->pr_itemsperpage != 0),
                  "%s: [%s] pr_itemsperpage is zero, "
                  "pool not initialized?", __func__, pp->pr_wchan);
              KASSERTMSG((!(cpu_intr_p() || cpu_softintr_p())
                      || pp->pr_ipl != IPL_NONE || cold || panicstr != NULL),
                  "%s: [%s] is IPL_NONE, but called from interrupt context",
                  __func__, pp->pr_wchan);
              if (flags & PR_WAITOK) {
                      ASSERT_SLEEPABLE();
              }

              mutex_enter(&pp->pr_lock);
       startover:
              /*
               * Check to see if we've reached the hard limit.  If we have,
               * and we can wait, then wait until an item has been returned to
               * the pool.
               */
              KASSERTMSG((pp->pr_nout <= pp->pr_hardlimit),
                  "%s: %s: crossed hard limit", __func__, pp->pr_wchan);
              if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
                      if (pp->pr_drain_hook != NULL) {
                              /*
                               * Since the drain hook is going to free things
                               * back to the pool, unlock, call the hook, re-lock,
                               * and check the hardlimit condition again.
                               */
                              mutex_exit(&pp->pr_lock);
                              (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
                              mutex_enter(&pp->pr_lock);
                              if (pp->pr_nout < pp->pr_hardlimit)
                                      goto startover;
                      }
      
                      if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
                              /*
                               * XXX: A warning isn't logged in this case.  Should
                               * it be?
                               */
                              pp->pr_flags |= PR_WANTED;
                              do {
                                      cv_wait(&pp->pr_cv, &pp->pr_lock);
                              } while (pp->pr_flags & PR_WANTED);
                              goto startover;
                      }
      
                      /*
                       * Log a message that the hard limit has been hit.
                       */
                      if (pp->pr_hardlimit_warning != NULL &&
                          ratecheck(&pp->pr_hardlimit_warning_last,
                                    &pp->pr_hardlimit_ratecap))
                              log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);
      
                      pp->pr_nfail++;
      
                      mutex_exit(&pp->pr_lock);
                      KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
                      return NULL;
              }
      
              /*
               * The convention we use is that if `curpage' is not NULL, then
               * it points at a non-empty bucket. In particular, `curpage'
               * never points at a page header which has PR_PHINPAGE set and
               * has no items in its bucket.
               */
              if ((ph = pp->pr_curpage) == NULL) {
                      int error;

                      KASSERTMSG((pp->pr_nitems == 0),
                          "%s: [%s] curpage NULL, inconsistent nitems %u",
                          __func__, pp->pr_wchan, pp->pr_nitems);

                      /*
                       * Call the back-end page allocator for more memory.
                       * Release the pool lock, as the back-end page allocator
                       * may block.
                       */
                      error = pool_grow(pp, flags);
                      if (error != 0) {
                              /*
                               * pool_grow aborts when another thread
                               * is allocating a new page. Retry if it
                               * waited for it.
                               */
                              if (error == ERESTART)
                                      goto startover;
      
                              /*
                               * We were unable to allocate a page or item
                               * header, but we released the lock during
                               * allocation, so perhaps items were freed
                               * back to the pool.  Check for this case.
                               */
                              if (pp->pr_curpage != NULL)
                                      goto startover;
      
                              pp->pr_nfail++;
                              mutex_exit(&pp->pr_lock);
                              KASSERT((flags & (PR_WAITOK|PR_NOWAIT)) == PR_NOWAIT);
                              return NULL;
                      }
      
                      /* Start the allocation process over. */
                      goto startover;
              }
              if (pp->pr_roflags & PR_USEBMAP) {
                      KASSERTMSG((ph->ph_nmissing < pp->pr_itemsperpage),
                          "%s: [%s] pool page empty", __func__, pp->pr_wchan);
                      v = pr_item_bitmap_get(pp, ph);
              } else {
                      v = pr_item_linkedlist_get(pp, ph);
              }
              pp->pr_nitems--;
              pp->pr_nout++;
              if (ph->ph_nmissing == 0) {
                      KASSERT(pp->pr_nidle > 0);
                      pp->pr_nidle--;

                      /*
                       * This page was previously empty.  Move it to the list of
                       * partially-full pages.  This page is already curpage.
                       */
                      LIST_REMOVE(ph, ph_pagelist);
                      LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
              }
              ph->ph_nmissing++;
              if (ph->ph_nmissing == pp->pr_itemsperpage) {
                      KASSERTMSG(((pp->pr_roflags & PR_USEBMAP) ||
                              LIST_EMPTY(&ph->ph_itemlist)),
                          "%s: [%s] nmissing (%u) inconsistent", __func__,
                              pp->pr_wchan, ph->ph_nmissing);
                      /*
                       * This page is now full.  Move it to the full list
                       * and select a new current page.
                       */
                      LIST_REMOVE(ph, ph_pagelist);
                      LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
                      pool_update_curpage(pp);
              }

              pp->pr_nget++;
      
              /*
               * If we have a low water mark and we are now below that low
               * water mark, add more items to the pool.
               */
              if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
                      /*
                       * XXX: Should we log a warning?  Should we set up a timeout
                       * to try again in a second or so?  The latter could break
                       * a caller's assumptions about interrupt protection, etc.
                       */
              }
      
              mutex_exit(&pp->pr_lock);
              KASSERT((((vaddr_t)v) & (pp->pr_align - 1)) == 0);
              FREECHECK_OUT(&pp->pr_freecheck, v);
              pool_redzone_fill(pp, v);
              if (flags & PR_ZERO)
                      memset(v, 0, pp->pr_reqsize);
              else
                      pool_kleak_fill(pp, v);
              return v;
      }
      
      /*
       * Internal version of pool_put().  Pool is already locked/entered.
       */
      static void
      pool_do_put(struct pool *pp, void *v, struct pool_pagelist *pq)
      {
              struct pool_item_header *ph;
      
              KASSERT(mutex_owned(&pp->pr_lock));
              pool_redzone_check(pp, v);
              FREECHECK_IN(&pp->pr_freecheck, v);
              LOCKDEBUG_MEM_CHECK(v, pp->pr_size);

              KASSERTMSG((pp->pr_nout > 0),
                  "%s: [%s] putting with none out", __func__, pp->pr_wchan);

              if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) {
                      panic("%s: [%s] page header missing", __func__,  pp->pr_wchan);
              }
      
              /*
               * Return to item list.
               */
              if (pp->pr_roflags & PR_USEBMAP) {
                      pr_item_bitmap_put(pp, ph, v);
              } else {
                      pr_item_linkedlist_put(pp, ph, v);
              }
              KDASSERT(ph->ph_nmissing != 0);
              ph->ph_nmissing--;
              pp->pr_nput++;
              pp->pr_nitems++;
              pp->pr_nout--;

              /* Cancel "pool empty" condition if it exists */
              if (pp->pr_curpage == NULL)
                      pp->pr_curpage = ph;

              if (pp->pr_flags & PR_WANTED) {
                      pp->pr_flags &= ~PR_WANTED;
                      cv_broadcast(&pp->pr_cv);
              }
      
              /*
               * If this page is now empty, do one of two things:
               *
               *        (1) If we have more pages than the page high water mark,
               *            free the page back to the system.  ONLY CONSIDER
               *            FREEING BACK A PAGE IF WE HAVE MORE THAN OUR MINIMUM PAGE
               *            CLAIM.
               *
               *        (2) Otherwise, move the page to the empty page list.
               *
               * Either way, select a new current page (so we use a partially-full
               * page if one is available).
               */
              if (ph->ph_nmissing == 0) {
                      pp->pr_nidle++;
                      if (pp->pr_npages > pp->pr_minpages &&
                          pp->pr_npages > pp->pr_maxpages) {
                              pr_rmpage(pp, ph, pq);
                      } else {
                              LIST_REMOVE(ph, ph_pagelist);
                              LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);

                              /*
                               * Update the timestamp on the page.  A page must
                               * be idle for some period of time before it can
                               * be reclaimed by the pagedaemon.  This minimizes
                               * ping-pong'ing for memory.
                               *
                               * note for 64-bit time_t: truncating to 32-bit is not
                               * a problem for our usage.
                               */
                              ph->ph_time = time_uptime;
                      }
                      pool_update_curpage(pp);
              }
      
              /*
               * If the page was previously completely full, move it to the
               * partially-full list and make it the current page.  The next
               * allocation will get the item from this page, instead of
               * further fragmenting the pool.
               */
              else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
                      LIST_REMOVE(ph, ph_pagelist);
                      LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
                      pp->pr_curpage = ph;
              }
      }
      
      void
      pool_put(struct pool *pp, void *v)
      {
              struct pool_pagelist pq;

              LIST_INIT(&pq);

              mutex_enter(&pp->pr_lock);
              if (!pool_put_quarantine(pp, v, &pq)) {
                      pool_do_put(pp, v, &pq);
              }
              mutex_exit(&pp->pr_lock);
      
              pr_pagelist_free(pp, &pq);
      }
      
      /*
       * pool_grow: grow a pool by a page.
       *
       * => called with pool locked.
       * => unlock and relock the pool.
       * => return with pool locked.
       */
      
      static int
      pool_grow(struct pool *pp, int flags)
      {
              struct pool_item_header *ph;
              char *storage;
      
              /*
               * If there's a pool_grow in progress, wait for it to complete
               * and try again from the top.
               */
              if (pp->pr_flags & PR_GROWING) {
                      if (flags & PR_WAITOK) {
                              do {
                                      cv_wait(&pp->pr_cv, &pp->pr_lock);
                              } while (pp->pr_flags & PR_GROWING);
                              return ERESTART;
                      } else {
                              if (pp->pr_flags & PR_GROWINGNOWAIT) {
                                      /*
                                       * This needs an unlock/relock dance so
                                       * that the other caller has a chance to
                                       * run and actually do the thing.  Note
                                       * that this is effectively a busy-wait.
                                       */
                                      mutex_exit(&pp->pr_lock);
                                      mutex_enter(&pp->pr_lock);
                                      return ERESTART;
                              }
                              return EWOULDBLOCK;
                      }
              }
              pp->pr_flags |= PR_GROWING;
              if (flags & PR_WAITOK)
                      mutex_exit(&pp->pr_lock);
              else
                      pp->pr_flags |= PR_GROWINGNOWAIT;
      
              storage = pool_allocator_alloc(pp, flags);
              if (__predict_false(storage == NULL))
                      goto out;
      
              ph = pool_alloc_item_header(pp, storage, flags);
              if (__predict_false(ph == NULL)) {
                      pool_allocator_free(pp, storage);
                      goto out;
              }

              if (flags & PR_WAITOK)
                      mutex_enter(&pp->pr_lock);
              pool_prime_page(pp, storage, ph);
              pp->pr_npagealloc++;
              KASSERT(pp->pr_flags & PR_GROWING);
              pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
              /*
               * If anyone was waiting for pool_grow, notify them that we
               * may have just done it.
               */
              cv_broadcast(&pp->pr_cv);
              return 0;
      out:
              if (flags & PR_WAITOK)
                      mutex_enter(&pp->pr_lock);
              KASSERT(pp->pr_flags & PR_GROWING);
              pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
              return ENOMEM;
      }
      
      /*
       * Add N items to the pool.
       */
      int
      pool_prime(struct pool *pp, int n)
      {
              int newpages;
              int error = 0;
      
              mutex_enter(&pp->pr_lock);
      
              newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
      
              while (newpages > 0) {
                      error = pool_grow(pp, PR_NOWAIT);
                      if (error) {
                              if (error == ERESTART)
                                      continue;
                              break;
                      }
                      pp->pr_minpages++;
                      newpages--;
              }
      
              if (pp->pr_minpages >= pp->pr_maxpages)
                      pp->pr_maxpages = pp->pr_minpages + 1;        /* XXX */
      
              mutex_exit(&pp->pr_lock);
              return error;
      }
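
      /*
       * Example (hypothetical numbers): pool_prime(pp, 100) on a pool with
       * pr_itemsperpage = 32 grows the pool by roundup(100, 32) / 32 = 4
       * pages and raises pr_minpages by the same amount.
       */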
      
      /*
       * Add a page worth of items to the pool.
       *
       * Note, we must be called with the pool descriptor LOCKED.
       */
      static void
      pool_prime_page(struct pool *pp, void *storage, struct pool_item_header *ph)
      {
              const unsigned int align = pp->pr_align;
              struct pool_item *pi;
              void *cp = storage;
              int n;
      
              KASSERT(mutex_owned(&pp->pr_lock));
              KASSERTMSG(((pp->pr_roflags & PR_NOALIGN) ||
                      (((uintptr_t)cp & (pp->pr_alloc->pa_pagesz - 1)) == 0)),
                  "%s: [%s] unaligned page: %p", __func__, pp->pr_wchan, cp);
      
              /*
               * Insert page header.
               */
              LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
              LIST_INIT(&ph->ph_itemlist);
              ph->ph_page = storage;
              ph->ph_nmissing = 0;
              ph->ph_time = time_uptime;
              if (pp->pr_roflags & PR_PHINPAGE)
                      ph->ph_poolid = pp->pr_poolid;
              else
                      SPLAY_INSERT(phtree, &pp->pr_phtree, ph);

              pp->pr_nidle++;
      
              /*
               * The item space starts after the on-page header, if any.
               */
              ph->ph_off = pp->pr_itemoffset;
      
              /*
               * Color this page.
               */
              ph->ph_off += pp->pr_curcolor;
              cp = (char *)cp + ph->ph_off;
              if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
                      pp->pr_curcolor = 0;

              KASSERT((((vaddr_t)cp) & (align - 1)) == 0);
      
              /*
               * Insert remaining chunks on the bucket list.
               */
              n = pp->pr_itemsperpage;
              pp->pr_nitems += n;

              if (pp->pr_roflags & PR_USEBMAP) {
                      pr_item_bitmap_init(pp, ph);
              } else {
                      while (n--) {
                              pi = (struct pool_item *)cp;

                              KASSERT((((vaddr_t)pi) & (align - 1)) == 0);

                              /* Insert on page list */
                              LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
      #ifdef POOL_CHECK_MAGIC
                              pi->pi_magic = PI_MAGIC;
      #endif
                              cp = (char *)cp + pp->pr_size;
      
                              KASSERT((((vaddr_t)cp) & (align - 1)) == 0);
                      }
              }
      
              /*
               * If the pool was depleted, point at the new page.
               */
              if (pp->pr_curpage == NULL)
                      pp->pr_curpage = ph;

              if (++pp->pr_npages > pp->pr_hiwat)
                      pp->pr_hiwat = pp->pr_npages;
      }
      
      /*
       * Used by pool_get() when nitems drops below the low water mark: grow
       * the pool until pr_nitems has caught up with the low water mark.
       *
       * Note 1, we never wait for memory here, we let the caller decide what to do.
       *
       * Note 2, we must be called with the pool already locked, and we return
       * with it locked.
       */
      static int
      pool_catchup(struct pool *pp)
      {
              int error = 0;
      
              while (POOL_NEEDS_CATCHUP(pp)) {
                      error = pool_grow(pp, PR_NOWAIT);
                      if (error) {
                              if (error == ERESTART)
                                      continue;
                              break;
                      }
              }
              return error;
      }
      
      static void
      pool_update_curpage(struct pool *pp)
      {
      
              pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
              if (pp->pr_curpage == NULL) {
                      pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
              }
              KASSERT((pp->pr_curpage == NULL && pp->pr_nitems == 0) ||
                  (pp->pr_curpage != NULL && pp->pr_nitems > 0));
      }
      
      void
      pool_setlowat(struct pool *pp, int n)
      {
      
              mutex_enter(&pp->pr_lock);
      
              pp->pr_minitems = n;
              pp->pr_minpages = (n == 0)
                      ? 0
                      : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
      
              /* Make sure we're caught up with the newly-set low water mark. */
              if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
                      /*
                       * XXX: Should we log a warning?  Should we set up a timeout
                       * to try again in a second or so?  The latter could break
                       * a caller's assumptions about interrupt protection, etc.
                       */
              }
      
              mutex_exit(&pp->pr_lock);
      }
      
      void
      pool_sethiwat(struct pool *pp, int n)
      {
      
              mutex_enter(&pp->pr_lock);
      
              pp->pr_maxpages = (n == 0)
                      ? 0
                      : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
      
              mutex_exit(&pp->pr_lock);
      }
      
      void
      pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap)
      {
      
              mutex_enter(&pp->pr_lock);
      
              pp->pr_hardlimit = n;
              pp->pr_hardlimit_warning = warnmess;
              pp->pr_hardlimit_ratecap.tv_sec = ratecap;
              pp->pr_hardlimit_warning_last.tv_sec = 0;
              pp->pr_hardlimit_warning_last.tv_usec = 0;
      
              /*
               * In-line version of pool_sethiwat(), because we don't want to
               * release the lock.
               */
              pp->pr_maxpages = (n == 0)
                      ? 0
                      : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
      
              mutex_exit(&pp->pr_lock);
      }
      
      /*
       * Release all complete pages that have not been used recently.
       *
       * Must not be called from interrupt context.
       */
      int
      pool_reclaim(struct pool *pp)
      {
              struct pool_item_header *ph, *phnext;
              struct pool_pagelist pq;
              uint32_t curtime;
              bool klock;
              int rv;
      
              KASSERT(!cpu_intr_p() && !cpu_softintr_p());
      
              if (pp->pr_drain_hook != NULL) {
                      /*
                       * The drain hook must be called with the pool unlocked.
                       */
                      (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT);
              }
      
              /*
               * XXXSMP Because we do not want to cause non-MPSAFE code
               * to block.
               */
              if (pp->pr_ipl == IPL_SOFTNET || pp->pr_ipl == IPL_SOFTCLOCK ||
                  pp->pr_ipl == IPL_SOFTSERIAL) {
                      KERNEL_LOCK(1, NULL);
                      klock = true;
              } else
                      klock = false;
      
              /* Reclaim items from the pool's cache (if any). */
              if (pp->pr_cache != NULL)
                      pool_cache_invalidate(pp->pr_cache);
      
              if (mutex_tryenter(&pp->pr_lock) == 0) {
                      if (klock) {
                              KERNEL_UNLOCK_ONE(NULL);
                      }
                      return 0;
              }
      
              LIST_INIT(&pq);
      
              curtime = time_uptime;
      
              for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
                      phnext = LIST_NEXT(ph, ph_pagelist);
      
                      /* Check our minimum page claim */
                      if (pp->pr_npages <= pp->pr_minpages)
                              break;
      
                      KASSERT(ph->ph_nmissing == 0);
                      if (curtime - ph->ph_time < pool_inactive_time)
                              continue;
      
                      /*
                       * If freeing this page would put us below
                       * the low water mark, stop now.
                       */
                      if ((pp->pr_nitems - pp->pr_itemsperpage) <
                          pp->pr_minitems)
                              break;
      
                      pr_rmpage(pp, ph, &pq);
              }
      
              mutex_exit(&pp->pr_lock);
      
              if (LIST_EMPTY(&pq))
                      rv = 0;
              else {
                      pr_pagelist_free(pp, &pq);
                      rv = 1;
              }
      
              if (klock) {
                      KERNEL_UNLOCK_ONE(NULL);
              }
      
              return rv;
      }
      
      /*
       * Drain pools, one at a time. The drained pool is returned in *ppp.
       *
       * Note, must never be called from interrupt context.
       */
      bool
      pool_drain(struct pool **ppp)
      {
              bool reclaimed;
              struct pool *pp;
      
              KASSERT(!TAILQ_EMPTY(&pool_head));
      
              pp = NULL;
      
              /* Find next pool to drain, and add a reference. */
              mutex_enter(&pool_head_lock);
              do {
                      if (drainpp == NULL) {
                              drainpp = TAILQ_FIRST(&pool_head);
                      }
                      if (drainpp != NULL) {
                              pp = drainpp;
                              drainpp = TAILQ_NEXT(pp, pr_poollist);
                      }
                      /*
                       * Skip completely idle pools.  We depend on at least
                       * one pool in the system being active.
                       */
              } while (pp == NULL || pp->pr_npages == 0);
              pp->pr_refcnt++;
              mutex_exit(&pool_head_lock);
      
              /* Drain the cache (if any) and pool. */
              reclaimed = pool_reclaim(pp);
      
              /* Finally, unlock the pool. */
              mutex_enter(&pool_head_lock);
              pp->pr_refcnt--;
              cv_broadcast(&pool_busy);
              mutex_exit(&pool_head_lock);
      
              if (ppp != NULL)
                      *ppp = pp;
      
              return reclaimed;
      }
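
      /*
       * Example (editor's sketch, not part of the original source): a
       * memory-pressure handler might release idle pages from one pool per
       * call, along the lines of:
       *
       *        struct pool *pp = NULL;
       *
       *        if (pool_drain(&pp))
       *                printf("released idle pages from pool \"%s\"\n",
       *                    pp->pr_wchan);
       */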
      
      /*
       * Calculate the total number of pages consumed by pools.
       */
      int
      pool_totalpages(void)
      {
      
              mutex_enter(&pool_head_lock);
              int pages = pool_totalpages_locked();
              mutex_exit(&pool_head_lock);
      
              return pages;
      }
      
      int
      pool_totalpages_locked(void)
      {
              struct pool *pp;
              uint64_t total = 0;
      
              TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
                      uint64_t bytes = pp->pr_npages * pp->pr_alloc->pa_pagesz;
      
                      if ((pp->pr_roflags & PR_RECURSIVE) != 0)
                              bytes -= (pp->pr_nout * pp->pr_size);
                      total += bytes;
              }
      
              return atop(total);
      }
      
      /*
       * Diagnostic helpers.
       */
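
      /*
       * pool_printall, pool_printit:
       *
       *        Print pool state, typically from DDB.  The `modif' string is
       *        passed down to pool_print1(), where 'l', 'p' and 'c' select
       *        the (now empty) log section, the page lists and the cache
       *        groups respectively.  (Editor's summary of the code below,
       *        added for clarity.)
       */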
      
      void
      pool_printall(const char *modif, void (*pr)(const char *, ...))
      {
              struct pool *pp;
      
              TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
                      pool_printit(pp, modif, pr);
              }
      }
      
      void
      pool_printit(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
      {
      
              if (pp == NULL) {
                      (*pr)("Must specify a pool to print.\n");
                      return;
              }
      
              pool_print1(pp, modif, pr);
      }
      
      static void
      pool_print_pagelist(struct pool *pp, struct pool_pagelist *pl,
          void (*pr)(const char *, ...))
      {
              struct pool_item_header *ph;
      
              LIST_FOREACH(ph, pl, ph_pagelist) {
                      (*pr)("\t\tpage %p, nmissing %d, time %" PRIu32 "\n",
                          ph->ph_page, ph->ph_nmissing, ph->ph_time);
      #ifdef POOL_CHECK_MAGIC
                      struct pool_item *pi;
                      if (!(pp->pr_roflags & PR_USEBMAP)) {
                              LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) {
                                      if (pi->pi_magic != PI_MAGIC) {
                                              (*pr)("\t\t\titem %p, magic 0x%x\n",
                                                  pi, pi->pi_magic);
                                      }
                              }
                      }
      #endif
              }
      }
      
      static void
      pool_print1(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
      {
              struct pool_item_header *ph;
              pool_cache_t pc;
              pcg_t *pcg;
              pool_cache_cpu_t *cc;
              uint64_t cpuhit, cpumiss;
              int i, print_log = 0, print_pagelist = 0, print_cache = 0;
              char c;
      
              while ((c = *modif++) != '\0') {
                      if (c == 'l')
                              print_log = 1;
                      if (c == 'p')
                              print_pagelist = 1;
                      if (c == 'c')
                              print_cache = 1;
              }
      
              if ((pc = pp->pr_cache) != NULL) {
                      (*pr)("POOL CACHE");
              } else {
                      (*pr)("POOL");
              }
      
              (*pr)(" %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
                  pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
                  pp->pr_roflags);
              (*pr)("\talloc %p\n", pp->pr_alloc);
              (*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
                  pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
              (*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
                  pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
      
              (*pr)("\tnget %lu, nfail %lu, nput %lu\n",
                  pp->pr_nget, pp->pr_nfail, pp->pr_nput);
              (*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
                  pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
      
              if (print_pagelist == 0)
                      goto skip_pagelist;
      
              if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
                      (*pr)("\n\tempty page list:\n");
              pool_print_pagelist(pp, &pp->pr_emptypages, pr);
              if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
                      (*pr)("\n\tfull page list:\n");
              pool_print_pagelist(pp, &pp->pr_fullpages, pr);
              if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
                      (*pr)("\n\tpartial-page list:\n");
              pool_print_pagelist(pp, &pp->pr_partpages, pr);
      
              if (pp->pr_curpage == NULL)
                      (*pr)("\tno current page\n");
              else
                      (*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
      
       skip_pagelist:
              if (print_log == 0)
                      goto skip_log;
      
              (*pr)("\n");
      
       skip_log:
      
      #define PR_GROUPLIST(pcg)                                                \
              (*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail);                \
              for (i = 0; i < pcg->pcg_size; i++) {                                \
                      if (pcg->pcg_objects[i].pcgo_pa !=                        \
                          POOL_PADDR_INVALID) {                                \
                              (*pr)("\t\t\t%p, 0x%llx\n",                        \
                                  pcg->pcg_objects[i].pcgo_va,                \
                                  (unsigned long long)                        \
                                  pcg->pcg_objects[i].pcgo_pa);                \
                      } else {                                                \
                              (*pr)("\t\t\t%p\n",                                \
                                  pcg->pcg_objects[i].pcgo_va);                \
                      }                                                        \
              }
      
              if (pc != NULL) {
                      cpuhit = 0;
                      cpumiss = 0;
                      for (i = 0; i < __arraycount(pc->pc_cpus); i++) {
                              if ((cc = pc->pc_cpus[i]) == NULL)
                                      continue;
                              cpuhit += cc->cc_hits;
                              cpumiss += cc->cc_misses;
                      }
                      (*pr)("\tcpu layer hits %llu misses %llu\n", cpuhit, cpumiss);
                      (*pr)("\tcache layer hits %llu misses %llu\n",
                          pc->pc_hits, pc->pc_misses);
                      (*pr)("\tcache layer entry uncontended %llu contended %llu\n",
                          pc->pc_hits + pc->pc_misses - pc->pc_contended,
                          pc->pc_contended);
                      (*pr)("\tcache layer empty groups %u full groups %u\n",
                          pc->pc_nempty, pc->pc_nfull);
                      if (print_cache) {
                              (*pr)("\tfull cache groups:\n");
                              for (pcg = pc->pc_fullgroups; pcg != NULL;
                                  pcg = pcg->pcg_next) {
                                      PR_GROUPLIST(pcg);
                              }
                              (*pr)("\tempty cache groups:\n");
                              for (pcg = pc->pc_emptygroups; pcg != NULL;
                                  pcg = pcg->pcg_next) {
                                      PR_GROUPLIST(pcg);
                              }
                      }
              }
      #undef PR_GROUPLIST
      }
      
      static int
      pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph)
      {
              struct pool_item *pi;
              void *page;
              int n;
      
              if ((pp->pr_roflags & PR_NOALIGN) == 0) {
                      page = POOL_OBJ_TO_PAGE(pp, ph);
                      if (page != ph->ph_page &&
                          (pp->pr_roflags & PR_PHINPAGE) != 0) {
                              if (label != NULL)
                                      printf("%s: ", label);
                              printf("pool(%p:%s): page inconsistency: page %p;"
                                     " at page head addr %p (p %p)\n", pp,
                                      pp->pr_wchan, ph->ph_page,
                                      ph, page);
                              return 1;
                      }
              }
      
              if ((pp->pr_roflags & PR_USEBMAP) != 0)
                      return 0;
      
              for (pi = LIST_FIRST(&ph->ph_itemlist), n = 0;
                   pi != NULL;
                   pi = LIST_NEXT(pi,pi_list), n++) {
      
      #ifdef POOL_CHECK_MAGIC
                      if (pi->pi_magic != PI_MAGIC) {
                              if (label != NULL)
                                      printf("%s: ", label);
                              printf("pool(%s): free list modified: magic=%x;"
                                     " page %p; item ordinal %d; addr %p\n",
                                      pp->pr_wchan, pi->pi_magic, ph->ph_page,
                                      n, pi);
                              panic("pool");
                      }
      #endif
                      if ((pp->pr_roflags & PR_NOALIGN) != 0) {
                              continue;
                      }
                      page = POOL_OBJ_TO_PAGE(pp, pi);
                      if (page == ph->ph_page)
                              continue;
      
                      if (label != NULL)
                              printf("%s: ", label);
                      printf("pool(%p:%s): page inconsistency: page %p;"
                             " item ordinal %d; addr %p (p %p)\n", pp,
                              pp->pr_wchan, ph->ph_page,
                              n, pi, page);
                      return 1;
              }
              return 0;
      }
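
      /*
       * pool_chk:
       *
       *        Consistency check: walk the empty, full and partial page lists
       *        and verify each page header (and, for non-bitmap pools, each
       *        free item) with pool_chk_page().  Returns nonzero at the first
       *        inconsistency found.  (Editor's summary, added for clarity.)
       */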
      
      
      int
      pool_chk(struct pool *pp, const char *label)
      {
              struct pool_item_header *ph;
              int r = 0;
      
              mutex_enter(&pp->pr_lock);
              LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
                      r = pool_chk_page(pp, label, ph);
                      if (r) {
                              goto out;
                      }
              }
              LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
                      r = pool_chk_page(pp, label, ph);
                      if (r) {
                              goto out;
                      }
              }
              LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
                      r = pool_chk_page(pp, label, ph);
                      if (r) {
                              goto out;
                      }
              }
      
      out:
              mutex_exit(&pp->pr_lock);
              return r;
      }
      
      /*
       * pool_cache_init:
       *
       *        Initialize a pool cache.
       */
      pool_cache_t
      pool_cache_init(size_t size, u_int align, u_int align_offset, u_int flags,
          const char *wchan, struct pool_allocator *palloc, int ipl,
          int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), void *arg)
      {
              pool_cache_t pc;
      
              pc = pool_get(&cache_pool, PR_WAITOK);
              if (pc == NULL)
                      return NULL;
      
              pool_cache_bootstrap(pc, size, align, align_offset, flags, wchan,
                 palloc, ipl, ctor, dtor, arg);
      
              return pc;
      }
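
      /*
       * Example (editor's illustration, not part of the original source;
       * the "foo" names are hypothetical):
       *
       *        static pool_cache_t foo_cache;
       *
       *        foo_cache = pool_cache_init(sizeof(struct foo), coherency_unit,
       *            0, 0, "foopl", NULL, IPL_NONE, foo_ctor, foo_dtor, NULL);
       */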
      
      /*
       * pool_cache_bootstrap:
       *
       *        Kernel-private version of pool_cache_init().  The caller
       *        provides initial storage.
       */
      void
      pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align,
          u_int align_offset, u_int flags, const char *wchan,
          struct pool_allocator *palloc, int ipl,
          int (*ctor)(void *, void *, int), void (*dtor)(void *, void *),
          void *arg)
      {
              CPU_INFO_ITERATOR cii;
              pool_cache_t pc1;
              struct cpu_info *ci;
              struct pool *pp;
      
              pp = &pc->pc_pool;
              if (palloc == NULL && ipl == IPL_NONE) {
                      if (size > PAGE_SIZE) {
                              int bigidx = pool_bigidx(size);
      
                              palloc = &pool_allocator_big[bigidx];
                              flags |= PR_NOALIGN;
                      } else
                              palloc = &pool_allocator_nointr;
              }
              pool_init(pp, size, align, align_offset, flags, wchan, palloc, ipl);
              mutex_init(&pc->pc_lock, MUTEX_DEFAULT, ipl);
      
              if (ctor == NULL) {
                      ctor = (int (*)(void *, void *, int))nullop;
              }
              if (dtor == NULL) {
                      dtor = (void (*)(void *, void *))nullop;
              }
      
              pc->pc_emptygroups = NULL;
              pc->pc_fullgroups = NULL;
              pc->pc_partgroups = NULL;
              pc->pc_ctor = ctor;
              pc->pc_dtor = dtor;
              pc->pc_arg  = arg;
              pc->pc_hits  = 0;
              pc->pc_misses = 0;
              pc->pc_nempty = 0;
              pc->pc_npart = 0;
              pc->pc_nfull = 0;
              pc->pc_contended = 0;
              pc->pc_refcnt = 0;
              pc->pc_freecheck = NULL;
      
              if ((flags & PR_LARGECACHE) != 0) {
                      pc->pc_pcgsize = PCG_NOBJECTS_LARGE;
                      pc->pc_pcgpool = &pcg_large_pool;
              } else {
                      pc->pc_pcgsize = PCG_NOBJECTS_NORMAL;
                      pc->pc_pcgpool = &pcg_normal_pool;
              }
      
              /* Allocate per-CPU caches. */
              memset(pc->pc_cpus, 0, sizeof(pc->pc_cpus));
              pc->pc_ncpu = 0;
              if (ncpu < 2) {
                      /* XXX For sparc: boot CPU is not attached yet. */
                      pool_cache_cpu_init1(curcpu(), pc);
              } else {
                      for (CPU_INFO_FOREACH(cii, ci)) {
                              pool_cache_cpu_init1(ci, pc);
                      }
              }
      
              /* Add to list of all pools. */
              if (__predict_true(!cold))
                      mutex_enter(&pool_head_lock);
              TAILQ_FOREACH(pc1, &pool_cache_head, pc_cachelist) {
                      if (strcmp(pc1->pc_pool.pr_wchan, pc->pc_pool.pr_wchan) > 0)
                              break;
              }
              if (pc1 == NULL)
                      TAILQ_INSERT_TAIL(&pool_cache_head, pc, pc_cachelist);
              else
                      TAILQ_INSERT_BEFORE(pc1, pc, pc_cachelist);
              if (__predict_true(!cold))
                      mutex_exit(&pool_head_lock);
      
              membar_sync();
              pp->pr_cache = pc;
      }
      
      /*
       * pool_cache_destroy:
       *
       *        Destroy a pool cache.
       */
      void
      pool_cache_destroy(pool_cache_t pc)
      {
      
              pool_cache_bootstrap_destroy(pc);
              pool_put(&cache_pool, pc);
      }
      
      /*
       * pool_cache_bootstrap_destroy:
       *
       *        Destroy a pool cache.
       */
      void
      pool_cache_bootstrap_destroy(pool_cache_t pc)
      {
              struct pool *pp = &pc->pc_pool;
              u_int i;
      
              /* Remove it from the global list. */
              mutex_enter(&pool_head_lock);
              while (pc->pc_refcnt != 0)
                      cv_wait(&pool_busy, &pool_head_lock);
              TAILQ_REMOVE(&pool_cache_head, pc, pc_cachelist);
              mutex_exit(&pool_head_lock);
      
              /* First, invalidate the entire cache. */
              pool_cache_invalidate(pc);
      
              /* Disassociate it from the pool. */
              mutex_enter(&pp->pr_lock);
              pp->pr_cache = NULL;
              mutex_exit(&pp->pr_lock);
      
              /* Destroy per-CPU data */
              for (i = 0; i < __arraycount(pc->pc_cpus); i++)
                      pool_cache_invalidate_cpu(pc, i);
      
              /* Finally, destroy it. */
              mutex_destroy(&pc->pc_lock);
              pool_destroy(pp);
      }
      
      /*
       * pool_cache_cpu_init1:
       *
       *        Called for each pool_cache whenever a new CPU is attached.
       */
      static void
      pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc)
      {
              pool_cache_cpu_t *cc;
              int index;
      
              index = ci->ci_index;
      
              KASSERT(index < __arraycount(pc->pc_cpus));
      
              if ((cc = pc->pc_cpus[index]) != NULL) {
                      KASSERT(cc->cc_cpuindex == index);
                      return;
              }
      
              /*
               * The first CPU is 'free'.  This needs to be the case for
               * bootstrap - we may not be able to allocate yet.
               */
              if (pc->pc_ncpu == 0) {
                      cc = &pc->pc_cpu0;
                      pc->pc_ncpu = 1;
              } else {
                      mutex_enter(&pc->pc_lock);
                      pc->pc_ncpu++;
                      mutex_exit(&pc->pc_lock);
                      cc = pool_get(&cache_cpu_pool, PR_WAITOK);
              }
      
              cc->cc_ipl = pc->pc_pool.pr_ipl;
              cc->cc_iplcookie = makeiplcookie(cc->cc_ipl);
              cc->cc_cache = pc;
              cc->cc_cpuindex = index;
              cc->cc_hits = 0;
              cc->cc_misses = 0;
              cc->cc_current = __UNCONST(&pcg_dummy);
              cc->cc_previous = __UNCONST(&pcg_dummy);
      
              pc->pc_cpus[index] = cc;
      }
      
      /*
       * pool_cache_cpu_init:
       *
       *        Called whenever a new CPU is attached.
       */
      void
      pool_cache_cpu_init(struct cpu_info *ci)
      {
              pool_cache_t pc;
      
              mutex_enter(&pool_head_lock);
              TAILQ_FOREACH(pc, &pool_cache_head, pc_cachelist) {
                      pc->pc_refcnt++;
                      mutex_exit(&pool_head_lock);
      
                      pool_cache_cpu_init1(ci, pc);
      
                      mutex_enter(&pool_head_lock);
                      pc->pc_refcnt--;
                      cv_broadcast(&pool_busy);
              }
              mutex_exit(&pool_head_lock);
      }
      
      /*
       * pool_cache_reclaim:
       *
       *        Reclaim memory from a pool cache.
       */
      bool
      pool_cache_reclaim(pool_cache_t pc)
      {
      
              return pool_reclaim(&pc->pc_pool);
      }
      
      static void
      pool_cache_destruct_object1(pool_cache_t pc, void *object)
      {
              (*pc->pc_dtor)(pc->pc_arg, object);
              pool_put(&pc->pc_pool, object);
      }
      
      /*
       * pool_cache_destruct_object:
       *
       *        Force destruction of an object and its release back into
       *        the pool.
       */
      void
      pool_cache_destruct_object(pool_cache_t pc, void *object)
      {
      
              FREECHECK_IN(&pc->pc_freecheck, object);
      
              pool_cache_destruct_object1(pc, object);
      }
      
      /*
       * pool_cache_invalidate_groups:
       *
       *        Invalidate a chain of groups and destruct all objects.
       */
      static void
      pool_cache_invalidate_groups(pool_cache_t pc, pcg_t *pcg)
      {
              void *object;
              pcg_t *next;
              int i;
      
              for (; pcg != NULL; pcg = next) {
                      next = pcg->pcg_next;
      
                      for (i = 0; i < pcg->pcg_avail; i++) {
                              object = pcg->pcg_objects[i].pcgo_va;
                              pool_cache_destruct_object1(pc, object);
                      }
      
                      if (pcg->pcg_size == PCG_NOBJECTS_LARGE) {
                              pool_put(&pcg_large_pool, pcg);
                      } else {
                              KASSERT(pcg->pcg_size == PCG_NOBJECTS_NORMAL);
                              pool_put(&pcg_normal_pool, pcg);
                      }
              }
      }
      
      /*
       * pool_cache_invalidate:
       *
       *        Invalidate a pool cache (destruct and release all of the
       *        cached objects).  Does not reclaim objects from the pool.
       *
       *        Note: For pool caches that provide constructed objects, there
       *        is an assumption that another level of synchronization is occurring
       *        between the input to the constructor and the cache invalidation.
       *
       *        Invalidation is a costly process and should not be called from
       *        interrupt context.
       */
      void
      pool_cache_invalidate(pool_cache_t pc)
      {
              uint64_t where;
              pcg_t *full, *empty, *part;
      
              KASSERT(!cpu_intr_p() && !cpu_softintr_p());
      
              if (ncpu < 2 || !mp_online) {
                      /*
                       * We might be called early enough in the boot process
                       * for the CPU data structures to not be fully initialized.
                       * In this case, transfer the content of the local CPU's
                       * cache back into global cache as only this CPU is currently
                       * running.
                       */
                      pool_cache_transfer(pc);
              } else {
                      /*
                       * Signal all CPUs that they must transfer their local
                       * cache back to the global pool then wait for the xcall to
                       * complete.
                       */
                      where = xc_broadcast(0, (xcfunc_t)pool_cache_transfer,
                          pc, NULL);
                      xc_wait(where);
              }
      
              /* Empty pool caches, then invalidate objects */
              mutex_enter(&pc->pc_lock);
              full = pc->pc_fullgroups;
              empty = pc->pc_emptygroups;
              part = pc->pc_partgroups;
              pc->pc_fullgroups = NULL;
              pc->pc_emptygroups = NULL;
              pc->pc_partgroups = NULL;
              pc->pc_nfull = 0;
              pc->pc_nempty = 0;
              pc->pc_npart = 0;
              mutex_exit(&pc->pc_lock);
      
              pool_cache_invalidate_groups(pc, full);
              pool_cache_invalidate_groups(pc, empty);
              pool_cache_invalidate_groups(pc, part);
      }
      
      /*
       * pool_cache_invalidate_cpu:
       *
       *        Invalidate all CPU-bound cached objects in pool cache, the CPU being
       *        identified by its associated index.
       *        It is caller's responsibility to ensure that no operation is
       *        taking place on this pool cache while doing this invalidation.
       *        WARNING: as no inter-CPU locking is enforced, trying to invalidate
       *        pool cached objects from a CPU different from the one currently running
       *        may result in undefined behaviour.
       */
      static void
      pool_cache_invalidate_cpu(pool_cache_t pc, u_int index)
      {
              pool_cache_cpu_t *cc;
              pcg_t *pcg;
      
              if ((cc = pc->pc_cpus[index]) == NULL)
                      return;
      
              if ((pcg = cc->cc_current) != &pcg_dummy) {
                      pcg->pcg_next = NULL;
                      pool_cache_invalidate_groups(pc, pcg);
              }
              if ((pcg = cc->cc_previous) != &pcg_dummy) {
                      pcg->pcg_next = NULL;
                      pool_cache_invalidate_groups(pc, pcg);
              }
              if (cc != &pc->pc_cpu0)
                      pool_put(&cache_cpu_pool, cc);
      
      }
      
      void
      pool_cache_set_drain_hook(pool_cache_t pc, void (*fn)(void *, int), void *arg)
      {
      
              pool_set_drain_hook(&pc->pc_pool, fn, arg);
      }
      
      void
      pool_cache_setlowat(pool_cache_t pc, int n)
      {
      
              pool_setlowat(&pc->pc_pool, n);
      }
      
      void
      pool_cache_sethiwat(pool_cache_t pc, int n)
      {
      
              pool_sethiwat(&pc->pc_pool, n);
      }
      
      void
      pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap)
      {
      
              pool_sethardlimit(&pc->pc_pool, n, warnmess, ratecap);
      }
      
      static bool __noinline
      pool_cache_get_slow(pool_cache_cpu_t *cc, int s, void **objectp,
                          paddr_t *pap, int flags)
      {
              pcg_t *pcg, *cur;
              uint64_t ncsw;
              pool_cache_t pc;
              void *object;
      
              KASSERT(cc->cc_current->pcg_avail == 0);
              KASSERT(cc->cc_previous->pcg_avail == 0);

              pc = cc->cc_cache;
              cc->cc_misses++;
      
              /*
               * Nothing was available locally.  Try and grab a group
               * from the cache.
               */
              if (__predict_false(!mutex_tryenter(&pc->pc_lock))) {
                      ncsw = curlwp->l_ncsw;
                      mutex_enter(&pc->pc_lock);
                      pc->pc_contended++;
      
                      /*
                       * If we context switched while locking, then
                       * our view of the per-CPU data is invalid:
                       * retry.
                       */
                      if (curlwp->l_ncsw != ncsw) {
                              mutex_exit(&pc->pc_lock);
                              return true;
                      }
              }
      
              if (__predict_true((pcg = pc->pc_fullgroups) != NULL)) {
                      /*
                       * If there's a full group, release our empty
                       * group back to the cache.  Install the full
                       * group as cc_current and return.
                       */
                      if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) {
                              KASSERT(cur->pcg_avail == 0);
                              cur->pcg_next = pc->pc_emptygroups;
                              pc->pc_emptygroups = cur;
                              pc->pc_nempty++;
                      }
                      KASSERT(pcg->pcg_avail == pcg->pcg_size);
                      cc->cc_current = pcg;
                      pc->pc_fullgroups = pcg->pcg_next;
                      pc->pc_hits++;
                      pc->pc_nfull--;
                      mutex_exit(&pc->pc_lock);
                      return true;
              }
      
              /*
               * Nothing available locally or in cache.  Take the slow
               * path: fetch a new object from the pool and construct
               * it.
               */
              pc->pc_misses++;
              mutex_exit(&pc->pc_lock);
              splx(s);
      
              object = pool_get(&pc->pc_pool, flags);
              *objectp = object;
              if (__predict_false(object == NULL)) {
                      KASSERT((flags & (PR_WAITOK|PR_NOWAIT)) == PR_NOWAIT);
                      return false;
              }
      
              if (__predict_false((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0)) {
                      pool_put(&pc->pc_pool, object);
                      *objectp = NULL;
                      return false;
              }
      
              KASSERT((((vaddr_t)object) & (pc->pc_pool.pr_align - 1)) == 0);

              if (pap != NULL) {
      #ifdef POOL_VTOPHYS
                      *pap = POOL_VTOPHYS(object);
      #else
                      *pap = POOL_PADDR_INVALID;
      #endif
              }
      
              FREECHECK_OUT(&pc->pc_freecheck, object);
              pool_cache_kleak_fill(pc, object);
              return false;
      }
      
      /*
       * pool_cache_get{,_paddr}:
       *
       *        Get an object from a pool cache (optionally returning
       *        the physical address of the object).
       */
      void *
      pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap)
      {
              pool_cache_cpu_t *cc;
              pcg_t *pcg;
              void *object;
              int s;
      
              KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));
              KASSERTMSG((!cpu_intr_p() && !cpu_softintr_p()) ||
                  (pc->pc_pool.pr_ipl != IPL_NONE || cold || panicstr != NULL),
                  "%s: [%s] is IPL_NONE, but called from interrupt context",
                  __func__, pc->pc_pool.pr_wchan);
      
              if (flags & PR_WAITOK) {
                      ASSERT_SLEEPABLE();
              }
      
              /* Lock out interrupts and disable preemption. */
              s = splvm();
              while (/* CONSTCOND */ true) {
                      /* Try and allocate an object from the current group. */
                      cc = pc->pc_cpus[curcpu()->ci_index];
                      KASSERT(cc->cc_cache == pc);
                      pcg = cc->cc_current;
                      if (__predict_true(pcg->pcg_avail > 0)) {
                              object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va;
                              if (__predict_false(pap != NULL))
                                      *pap = pcg->pcg_objects[pcg->pcg_avail].pcgo_pa;
      #if defined(DIAGNOSTIC)
                              pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL;
                              KASSERT(pcg->pcg_avail < pcg->pcg_size);
                              KASSERT(object != NULL);
      #endif
                              cc->cc_hits++;
                              splx(s);
                              FREECHECK_OUT(&pc->pc_freecheck, object);
                              pool_redzone_fill(&pc->pc_pool, object);
                              pool_cache_kleak_fill(pc, object);
                              return object;
                      }
      
                      /*
                       * That failed.  If the previous group isn't empty, swap
                       * it with the current group and allocate from there.
                       */
                      pcg = cc->cc_previous;
                      if (__predict_true(pcg->pcg_avail > 0)) {
                              cc->cc_previous = cc->cc_current;
                              cc->cc_current = pcg;
                              continue;
                      }
      
                      /*
                       * Can't allocate from either group: try the slow path.
                       * If get_slow() allocated an object for us, or if
                       * no more objects are available, it will return false.
                       * Otherwise, we need to retry.
                       */
                      if (!pool_cache_get_slow(cc, s, &object, pap, flags))
                              break;
              }
      
              /*
               * We would like to KASSERT(object || (flags & PR_NOWAIT)), but
               * pool_cache_get can fail even in the PR_WAITOK case, if the
               * constructor fails.
               */
              return object;
      }
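
      /*
       * Example (editor's illustration; "foo_cache" as in the sketch after
       * pool_cache_init() above):
       *
       *        struct foo *f;
       *
       *        f = pool_cache_get(foo_cache, PR_WAITOK);
       *        if (f != NULL) {
       *                ...
       *                pool_cache_put(foo_cache, f);
       *        }
       *
       * Note that even with PR_WAITOK the constructor may fail, so the
       * NULL check is not redundant (see the comment at the end of
       * pool_cache_get_paddr() above).
       */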
      
      static bool __noinline
      pool_cache_put_slow(pool_cache_cpu_t *cc, int s, void *object)
      {
              struct lwp *l = curlwp;
              pcg_t *pcg, *cur;
              uint64_t ncsw;
              pool_cache_t pc;
      
              KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size);
              KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size);
      
              pc = cc->cc_cache;
              pcg = NULL;
              cc->cc_misses++;
              ncsw = l->l_ncsw;
      
              /*
               * If there are no empty groups in the cache then allocate one
               * while still unlocked.
               */
              if (__predict_false(pc->pc_emptygroups == NULL)) {
                      if (__predict_true(!pool_cache_disable)) {
                              pcg = pool_get(pc->pc_pcgpool, PR_NOWAIT);
                      }
                      /*
                       * If pool_get() blocked, then our view of
                       * the per-CPU data is invalid: retry.
                       */
                      if (__predict_false(l->l_ncsw != ncsw)) {
                              if (pcg != NULL) {
                                      pool_put(pc->pc_pcgpool, pcg);
                              }
                              return true;
                      }
                      if (__predict_true(pcg != NULL)) {
                              pcg->pcg_avail = 0;
                              pcg->pcg_size = pc->pc_pcgsize;
                      }
              }
      
              /* Lock the cache. */
              if (__predict_false(!mutex_tryenter(&pc->pc_lock))) {
                      mutex_enter(&pc->pc_lock);
                      pc->pc_contended++;
      
                      /*
                       * If we context switched while locking, then our view of
                       * the per-CPU data is invalid: retry.
                       */
                      if (__predict_false(l->l_ncsw != ncsw)) {
                              mutex_exit(&pc->pc_lock);
                              if (pcg != NULL) {
                                      pool_put(pc->pc_pcgpool, pcg);
                              }
                              return true;
                      }
              }
      
              /* If there are no empty groups in the cache then allocate one. */
              if (pcg == NULL && pc->pc_emptygroups != NULL) {
                      pcg = pc->pc_emptygroups;
                      pc->pc_emptygroups = pcg->pcg_next;
                      pc->pc_nempty--;
              }
      
              /*
               * If there's an empty group, release our full group back
               * to the cache.  Install the empty group to the local CPU
               * and return.
               */
              if (pcg != NULL) {
                      KASSERT(pcg->pcg_avail == 0);
                      if (__predict_false(cc->cc_previous == &pcg_dummy)) {
                              cc->cc_previous = pcg;
                      } else {
                              cur = cc->cc_current;
                              if (__predict_true(cur != &pcg_dummy)) {
                                      KASSERT(cur->pcg_avail == cur->pcg_size);
                                      cur->pcg_next = pc->pc_fullgroups;
                                      pc->pc_fullgroups = cur;
                                      pc->pc_nfull++;
                              }
                              cc->cc_current = pcg;
                      }
                      pc->pc_hits++;
                      mutex_exit(&pc->pc_lock);
                      return true;
              }
      
              /*
               * Nothing available locally or in cache, and we didn't
               * allocate an empty group.  Take the slow path and destroy
               * the object here and now.
               */
              pc->pc_misses++;
              mutex_exit(&pc->pc_lock);
              splx(s);
              pool_cache_destruct_object(pc, object);
      
              return false;
      }
      
      /*
       * pool_cache_put{,_paddr}:
       *
       *        Put an object back to the pool cache (optionally caching the
       *        physical address of the object).
       */
      void
      pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa)
      {
              pool_cache_cpu_t *cc;
              pcg_t *pcg;
              int s;
      
              KASSERT(object != NULL);
              pool_cache_redzone_check(pc, object);
              FREECHECK_IN(&pc->pc_freecheck, object);
      
              if (pc->pc_pool.pr_roflags & PR_PHINPAGE) {
                      pc_phinpage_check(pc, object);
              }
      
              if (pool_cache_put_quarantine(pc, object, pa)) {
                      return;
              }
      
              /* Lock out interrupts and disable preemption. */
              s = splvm();
              while (/* CONSTCOND */ true) {
                      /* If the current group isn't full, release it there. */
                      cc = pc->pc_cpus[curcpu()->ci_index];
                      KASSERT(cc->cc_cache == pc);
                       pcg = cc->cc_current;
                      if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
                              pcg->pcg_objects[pcg->pcg_avail].pcgo_va = object;
                              pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa;
                              pcg->pcg_avail++;
                              cc->cc_hits++;
                              splx(s);
                              return;
                      }
      
                      /*
                       * That failed.  If the previous group isn't full, swap
                       * it with the current group and try again.
                       */
                      pcg = cc->cc_previous;
                      if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
                              cc->cc_previous = cc->cc_current;
                              cc->cc_current = pcg;
                              continue;
                      }
      
                      /*
                       * Can't free to either group: try the slow path.
                       * If put_slow() releases the object for us, it
                       * will return false.  Otherwise we need to retry.
                       */
                      if (!pool_cache_put_slow(cc, s, object))
                              break;
              }
      }
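
      /*
       * (Editor's note.)  Callers that do not track physical addresses are
       * expected to use the plain pool_cache_get()/pool_cache_put() wrappers
       * from <sys/pool.h>, which pass a NULL paddr_t pointer and
       * POOL_PADDR_INVALID respectively.
       */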
      
      /*
       * pool_cache_transfer:
       *
       *        Transfer objects from the per-CPU cache to the global cache.
       *        Run within a cross-call thread.
       */
      static void
      pool_cache_transfer(pool_cache_t pc)
      {
              pool_cache_cpu_t *cc;
              pcg_t *prev, *cur, **list;
              int s;
      
              s = splvm();
              mutex_enter(&pc->pc_lock);
              cc = pc->pc_cpus[curcpu()->ci_index];
              cur = cc->cc_current;
              cc->cc_current = __UNCONST(&pcg_dummy);
              prev = cc->cc_previous;
              cc->cc_previous = __UNCONST(&pcg_dummy);
              if (cur != &pcg_dummy) {
                      if (cur->pcg_avail == cur->pcg_size) {
                              list = &pc->pc_fullgroups;
                              pc->pc_nfull++;
                      } else if (cur->pcg_avail == 0) {
                              list = &pc->pc_emptygroups;
                              pc->pc_nempty++;
                      } else {
                              list = &pc->pc_partgroups;
                              pc->pc_npart++;
                      }
                      cur->pcg_next = *list;
                      *list = cur;
              }
              if (prev != &pcg_dummy) {
                      if (prev->pcg_avail == prev->pcg_size) {
                              list = &pc->pc_fullgroups;
                              pc->pc_nfull++;
                      } else if (prev->pcg_avail == 0) {
                              list = &pc->pc_emptygroups;
                              pc->pc_nempty++;
                      } else {
                              list = &pc->pc_partgroups;
                              pc->pc_npart++;
                      }
                      prev->pcg_next = *list;
                      *list = prev;
              }
              mutex_exit(&pc->pc_lock);
              splx(s);
      }
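
      /*
       * pool_bigidx:
       *
       *        Map an item size to the smallest pool_allocator_big[] bucket
       *        whose page size, 1 << (index + POOL_ALLOCATOR_BIG_BASE), can
       *        hold it; panic if the size exceeds the largest bucket.
       *        (Editor's summary, added for clarity.)
       */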
      
      static int
      pool_bigidx(size_t size)
      {
              int i;
      
              for (i = 0; i < __arraycount(pool_allocator_big); i++) {
                      if (1 << (i + POOL_ALLOCATOR_BIG_BASE) >= size)
                              return i;
              }
              panic("pool item size %zu too large, use a custom allocator", size);
      }
      
      static void *
      pool_allocator_alloc(struct pool *pp, int flags)
      {
              struct pool_allocator *pa = pp->pr_alloc;
              void *res;
      
              res = (*pa->pa_alloc)(pp, flags);
              if (res == NULL && (flags & PR_WAITOK) == 0) {
                      /*
                       * We only run the drain hook here if PR_NOWAIT.
                       * In other cases, the hook will be run in
                       * pool_reclaim().
                       */
                      if (pp->pr_drain_hook != NULL) {
                              (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
                              res = (*pa->pa_alloc)(pp, flags);
                      }
              }
              return res;
      }
      
      static void
      pool_allocator_free(struct pool *pp, void *v)
      {
              struct pool_allocator *pa = pp->pr_alloc;
      
              if (pp->pr_redzone) {
                      kasan_mark(v, pa->pa_pagesz, pa->pa_pagesz, 0);
              }
              (*pa->pa_free)(pp, v);
      }
      
      void *
      pool_page_alloc(struct pool *pp, int flags)
      {
              const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
              vmem_addr_t va;
              int ret;
      
              ret = uvm_km_kmem_alloc(kmem_va_arena, pp->pr_alloc->pa_pagesz,
                  vflags | VM_INSTANTFIT, &va);
      
              return ret ? NULL : (void *)va;
      }
      
      void
      pool_page_free(struct pool *pp, void *v)
      {
      
              uvm_km_kmem_free(kmem_va_arena, (vaddr_t)v, pp->pr_alloc->pa_pagesz);
      }
      
      static void *
      pool_page_alloc_meta(struct pool *pp, int flags)
      {
              const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
              vmem_addr_t va;
              int ret;
      
              ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz,
                  vflags | VM_INSTANTFIT, &va);
      
              return ret ? NULL : (void *)va;
      }
      
      static void
      pool_page_free_meta(struct pool *pp, void *v)
      {
      
              vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz);
      }
      
      #ifdef KLEAK
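      /*
       * KLEAK instrumentation (editor's summary): fill freshly allocated
       * items with a tracked pattern so that uninitialized kernel memory
       * leaking to userland can be detected.  Pools marked PR_NOTOUCH and
       * caches with a constructor or destructor are skipped, since their
       * contents must be left intact.
       */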
      static void
      pool_kleak_fill(struct pool *pp, void *p)
      {
              if (__predict_false(pp->pr_roflags & PR_NOTOUCH)) {
                      return;
              }
              kleak_fill_area(p, pp->pr_size);
      }
      
      static void
      pool_cache_kleak_fill(pool_cache_t pc, void *p)
      {
              if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc))) {
                      return;
              }
              pool_kleak_fill(&pc->pc_pool, p);
      }
      #endif
      
      #ifdef POOL_QUARANTINE
      static void
      pool_quarantine_init(struct pool *pp)
      {
              pp->pr_quar.rotor = 0;
              memset(&pp->pr_quar.list, 0, sizeof(pp->pr_quar.list));
      }
      
      static void
      pool_quarantine_flush(struct pool *pp)
      {
              pool_quar_t *quar = &pp->pr_quar;
              struct pool_pagelist pq;
              size_t i;
      
              LIST_INIT(&pq);
      
              mutex_enter(&pp->pr_lock);
              for (i = 0; i < POOL_QUARANTINE_DEPTH; i++) {
                      if (quar->list[i] == 0)
                              continue;
                      pool_do_put(pp, (void *)quar->list[i], &pq);
              }
              mutex_exit(&pp->pr_lock);
      
              pr_pagelist_free(pp, &pq);
      }
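
      /*
       * pool_put_quarantine (editor's summary): instead of freeing `v'
       * immediately, park it in a small ring (quar->list, advanced by
       * quar->rotor) and free the item it displaces, if any.  Delaying the
       * real free this way gives use-after-free detection a better chance
       * of catching stale references.
       */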
      
      static bool
      pool_put_quarantine(struct pool *pp, void *v, struct pool_pagelist *pq)
      {
              pool_quar_t *quar = &pp->pr_quar;
              uintptr_t old;
      
              if (pp->pr_roflags & PR_NOTOUCH) {
                      return false;
              }
      
              pool_redzone_check(pp, v);

              old = quar->list[quar->rotor];
              quar->list[quar->rotor] = (uintptr_t)v;
              quar->rotor = (quar->rotor + 1) % POOL_QUARANTINE_DEPTH;
              if (old != 0) {
                      pool_do_put(pp, (void *)old, pq);
              }
      
              return true;
      }
      
      static bool
      pool_cache_put_quarantine(pool_cache_t pc, void *p, paddr_t pa)
      {
              pool_cache_destruct_object(pc, p);
              return true;
      }
      #endif
      
      #ifdef POOL_REDZONE
      #if defined(_LP64)
      # define PRIME 0x9e37fffffffc0000UL
      #else /* defined(_LP64) */
      # define PRIME 0x9e3779b1
      #endif /* defined(_LP64) */
      #define STATIC_BYTE        0xFE
      CTASSERT(POOL_REDZONE_SIZE > 1);
      
      #ifndef KASAN
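      /*
       * pool_pattern_generate (editor's summary): derive a per-address fill
       * byte by multiplying the pointer value by PRIME and keeping the top
       * byte, giving each red-zone byte a cheap, address-dependent pattern
       * that pool_redzone_check() can recompute and verify later.
       */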
      static inline uint8_t
      pool_pattern_generate(const void *p)
      {
              return (uint8_t)(((uintptr_t)p) * PRIME
                 >> ((sizeof(uintptr_t) - sizeof(uint8_t))) * CHAR_BIT);
      }
      #endif
      
      static void
      pool_redzone_init(struct pool *pp, size_t requested_size)
      {
              size_t redzsz;
              size_t nsz;
      
      #ifdef KASAN
              redzsz = requested_size;
              kasan_add_redzone(&redzsz);
              redzsz -= requested_size;
      #else
              redzsz = POOL_REDZONE_SIZE;
      #endif
      
              if (pp->pr_roflags & PR_NOTOUCH) {
                      pp->pr_redzone = false;
                      return;
              }
      
              /*
               * We may have extended the requested size earlier; check if
               * there's naturally space in the padding for a red zone.
               */
              if (pp->pr_size - requested_size >= redzsz) {
                      pp->pr_reqsize_with_redzone = requested_size + redzsz;
                      pp->pr_redzone = true;
                      return;
              }
      
              /*
               * No space in the natural padding; check if we can extend a
               * bit the size of the pool.
               */
              nsz = roundup(pp->pr_size + redzsz, pp->pr_align);
              if (nsz <= pp->pr_alloc->pa_pagesz) {
                      /* Ok, we can */
                      pp->pr_size = nsz;
                      pp->pr_reqsize_with_redzone = requested_size + redzsz;
                      pp->pr_redzone = true;
              } else {
                      /* No space for a red zone... snif :'( */
                      pp->pr_redzone = false;
                      printf("pool redzone disabled for '%s'\n", pp->pr_wchan);
              }
      }
      
      static void
      pool_redzone_fill(struct pool *pp, void *p)
      {
              if (!pp->pr_redzone)
                      return;
      #ifdef KASAN
              kasan_mark(p, pp->pr_reqsize, pp->pr_reqsize_with_redzone,
                  KASAN_POOL_REDZONE);
      #else
              uint8_t *cp, pat;
              const uint8_t *ep;
      
              cp = (uint8_t *)p + pp->pr_reqsize;
              ep = cp + POOL_REDZONE_SIZE;
      
              /*
               * We really don't want the first byte of the red zone to be '\0';
               * an off-by-one in a string may not be properly detected.
               */
              pat = pool_pattern_generate(cp);
              *cp = (pat == '\0') ? STATIC_BYTE: pat;
              cp++;
      
              while (cp < ep) {
                      *cp = pool_pattern_generate(cp);
                      cp++;
              }
      #endif
      }
      
      static void
      pool_redzone_check(struct pool *pp, void *p)
      {
              if (!pp->pr_redzone)
                      return;
      #ifdef KASAN
              kasan_mark(p, 0, pp->pr_reqsize_with_redzone, KASAN_POOL_FREED);
      #else
              uint8_t *cp, pat, expected;
              const uint8_t *ep;
      
              cp = (uint8_t *)p + pp->pr_reqsize;
              ep = cp + POOL_REDZONE_SIZE;
      
              pat = pool_pattern_generate(cp);
              expected = (pat == '\0') ? STATIC_BYTE: pat;
              if (__predict_false(expected != *cp)) {
                      printf("%s: %p: 0x%02x != 0x%02x\n",
                         __func__, cp, *cp, expected);
              }
              cp++;
      
              while (cp < ep) {
                      expected = pool_pattern_generate(cp);
                      if (__predict_false(*cp != expected)) {
                              printf("%s: %p: 0x%02x != 0x%02x\n",
                                 __func__, cp, *cp, expected);
                      }
                      cp++;
              }
      #endif
      }
      
      static void
      pool_cache_redzone_check(pool_cache_t pc, void *p)
      {
      #ifdef KASAN
              /* If there is a ctor/dtor, leave the data as valid. */
  848         if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc))) {
                      return;
              }
      #endif
  723         pool_redzone_check(&pc->pc_pool, p);
      }
      
      #endif /* POOL_REDZONE */
      
      #if defined(DDB)
      static bool
      pool_in_page(struct pool *pp, struct pool_item_header *ph, uintptr_t addr)
      {
      
              return (uintptr_t)ph->ph_page <= addr &&
                  addr < (uintptr_t)ph->ph_page + pp->pr_alloc->pa_pagesz;
      }
      
      static bool
      pool_in_item(struct pool *pp, void *item, uintptr_t addr)
      {
      
              return (uintptr_t)item <= addr && addr < (uintptr_t)item + pp->pr_size;
      }
      
      static bool
      pool_in_cg(struct pool *pp, struct pool_cache_group *pcg, uintptr_t addr)
      {
              int i;
      
              if (pcg == NULL) {
                      return false;
              }
              for (i = 0; i < pcg->pcg_avail; i++) {
                      if (pool_in_item(pp, pcg->pcg_objects[i].pcgo_va, addr)) {
                              return true;
                      }
              }
              return false;
      }
      
      static bool
      pool_allocated(struct pool *pp, struct pool_item_header *ph, uintptr_t addr)
      {
      
              if ((pp->pr_roflags & PR_USEBMAP) != 0) {
                      unsigned int idx = pr_item_bitmap_index(pp, ph, (void *)addr);
                      pool_item_bitmap_t *bitmap =
                          ph->ph_bitmap + (idx / BITMAP_SIZE);
                      pool_item_bitmap_t mask = 1 << (idx & BITMAP_MASK);
      
                      return (*bitmap & mask) == 0;
              } else {
                      struct pool_item *pi;
      
                      LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) {
                              if (pool_in_item(pp, pi, addr)) {
                                      return false;
                              }
                      }
                      return true;
              }
      }
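       /*
        * A small worked example of the bitmap case, assuming 32-bit bitmap
        * words for illustration: for the item whose index within the page is
        * 37, the code above tests word 37 / 32 == 1, bit 37 & 31 == 5 of
        * ph_bitmap.  The bitmap records free items, so a clear bit means the
        * item is currently allocated.
        */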
      
      void
      pool_whatis(uintptr_t addr, void (*pr)(const char *, ...))
      {
              struct pool *pp;
      
              TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
                      struct pool_item_header *ph;
                      uintptr_t item;
                      bool allocated = true;
                      bool incache = false;
                      bool incpucache = false;
                      char cpucachestr[32];
      
                      if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
                              LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
                                      if (pool_in_page(pp, ph, addr)) {
                                              goto found;
                                      }
                              }
                              LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
                                      if (pool_in_page(pp, ph, addr)) {
                                              allocated =
                                                  pool_allocated(pp, ph, addr);
                                              goto found;
                                      }
                              }
                              LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
                                      if (pool_in_page(pp, ph, addr)) {
                                              allocated = false;
                                              goto found;
                                      }
                              }
                              continue;
                      } else {
                              ph = pr_find_pagehead_noalign(pp, (void *)addr);
                              if (ph == NULL || !pool_in_page(pp, ph, addr)) {
                                      continue;
                              }
                              allocated = pool_allocated(pp, ph, addr);
                      }
      found:
                      if (allocated && pp->pr_cache) {
                              pool_cache_t pc = pp->pr_cache;
                              struct pool_cache_group *pcg;
                              int i;
      
                              for (pcg = pc->pc_fullgroups; pcg != NULL;
                                  pcg = pcg->pcg_next) {
                                      if (pool_in_cg(pp, pcg, addr)) {
                                              incache = true;
                                              goto print;
                                      }
                              }
                              for (i = 0; i < __arraycount(pc->pc_cpus); i++) {
                                      pool_cache_cpu_t *cc;
      
                                      if ((cc = pc->pc_cpus[i]) == NULL) {
                                              continue;
                                      }
                                      if (pool_in_cg(pp, cc->cc_current, addr) ||
                                          pool_in_cg(pp, cc->cc_previous, addr)) {
                                              struct cpu_info *ci =
                                                  cpu_lookup(i);
      
                                              incpucache = true;
                                              snprintf(cpucachestr,
                                                  sizeof(cpucachestr),
                                                  "cached by CPU %u",
                                                  ci->ci_index);
                                              goto print;
                                      }
                              }
                      }
      print:
                      item = (uintptr_t)ph->ph_page + ph->ph_off;
                      item = item + rounddown(addr - item, pp->pr_size);
                      (*pr)("%p is %p+%zu in POOL '%s' (%s)\n",
                           (void *)addr, (void *)item, (size_t)(addr - item),
                          pp->pr_wchan,
                          incpucache ? cpucachestr :
                          incache ? "cached" : allocated ? "allocated" : "free");
              }
      }
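       /*
        * pool_whatis() is normally reached through ddb's "whatis" command;
        * for an address that belongs to a pool page the output follows the
        * format string above, e.g. (made-up values):
        *
        *      <addr> is <item>+40 in POOL 'buf2k' (allocated)
        */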
      #endif /* defined(DDB) */
      
      static int
      pool_sysctl(SYSCTLFN_ARGS)
      {
              struct pool_sysctl data;
              struct pool *pp;
              struct pool_cache *pc;
              pool_cache_cpu_t *cc;
              int error;
              size_t i, written;
      
              if (oldp == NULL) {
                      *oldlenp = 0;
                      TAILQ_FOREACH(pp, &pool_head, pr_poollist)
                              *oldlenp += sizeof(data);
                      return 0;
              }
      
              memset(&data, 0, sizeof(data));
              error = 0;
              written = 0;
              TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
                      if (written + sizeof(data) > *oldlenp)
                              break;
                      strlcpy(data.pr_wchan, pp->pr_wchan, sizeof(data.pr_wchan));
                      data.pr_pagesize = pp->pr_alloc->pa_pagesz;
                      data.pr_flags = pp->pr_roflags | pp->pr_flags;
      #define COPY(field) data.field = pp->field
                       COPY(pr_size);
                       COPY(pr_itemsperpage);
                      COPY(pr_nitems);
                      COPY(pr_nout);
                      COPY(pr_hardlimit);
                      COPY(pr_npages);
                      COPY(pr_minpages);
                      COPY(pr_maxpages);
      
                      COPY(pr_nget);
                      COPY(pr_nfail);
                      COPY(pr_nput);
                      COPY(pr_npagealloc);
                      COPY(pr_npagefree);
                      COPY(pr_hiwat);
                      COPY(pr_nidle);
      #undef COPY
      
                      data.pr_cache_nmiss_pcpu = 0;
                      data.pr_cache_nhit_pcpu = 0;
                      if (pp->pr_cache) {
                              pc = pp->pr_cache;
                              data.pr_cache_meta_size = pc->pc_pcgsize;
                              data.pr_cache_nfull = pc->pc_nfull;
                              data.pr_cache_npartial = pc->pc_npart;
                              data.pr_cache_nempty = pc->pc_nempty;
                              data.pr_cache_ncontended = pc->pc_contended;
                              data.pr_cache_nmiss_global = pc->pc_misses;
                              data.pr_cache_nhit_global = pc->pc_hits;
                              for (i = 0; i < pc->pc_ncpu; ++i) {
                                      cc = pc->pc_cpus[i];
                                      if (cc == NULL)
                                              continue;
                                      data.pr_cache_nmiss_pcpu += cc->cc_misses;
                                      data.pr_cache_nhit_pcpu += cc->cc_hits;
                              }
                      } else {
                              data.pr_cache_meta_size = 0;
                              data.pr_cache_nfull = 0;
                              data.pr_cache_npartial = 0;
                              data.pr_cache_nempty = 0;
                              data.pr_cache_ncontended = 0;
                              data.pr_cache_nmiss_global = 0;
                              data.pr_cache_nhit_global = 0;
                      }
      
                      error = sysctl_copyout(l, &data, oldp, sizeof(data));
                      if (error)
                              break;
                      written += sizeof(data);
                      oldp = (char *)oldp + sizeof(data);
              }
      
              *oldlenp = written;
              return error;
      }
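       /*
        * A rough userland sketch of consuming this node (the "kern.pool"
        * name comes from the SYSCTL_SETUP below; error handling and the
        * race against pools being created between the two calls are
        * ignored):
        *
        *      size_t len;
        *      struct pool_sysctl *pools;
        *
        *      sysctlbyname("kern.pool", NULL, &len, NULL, 0);
        *      pools = malloc(len);
        *      sysctlbyname("kern.pool", pools, &len, NULL, 0);
        *      for (size_t i = 0; i < len / sizeof(pools[0]); i++)
        *              printf("%s: %" PRIu64 " gets\n",
        *                  pools[i].pr_wchan, pools[i].pr_nget);
        *
        * The first call takes the oldp == NULL branch above and reports the
        * required buffer size; the second copies out one struct pool_sysctl
        * per pool.
        */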
      
      SYSCTL_SETUP(sysctl_pool_setup, "sysctl kern.pool setup")
      {
              const struct sysctlnode *rnode = NULL;
      
              sysctl_createv(clog, 0, NULL, &rnode,
                             CTLFLAG_PERMANENT,
                             CTLTYPE_STRUCT, "pool",
                             SYSCTL_DESCR("Get pool statistics"),
                             pool_sysctl, 0, NULL, 0,
                             CTL_KERN, CTL_CREATE, CTL_EOL);
      }
      /*        $NetBSD: ufs_bmap.c,v 1.52 2017/03/18 05:33:06 riastradh Exp $        */
      
      /*
       * Copyright (c) 1989, 1991, 1993
       *        The Regents of the University of California.  All rights reserved.
       * (c) UNIX System Laboratories, Inc.
       * All or some portions of this file are derived from material licensed
       * to the University of California by American Telephone and Telegraph
       * Co. or Unix System Laboratories, Inc. and are reproduced herein with
       * the permission of UNIX System Laboratories, Inc.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)ufs_bmap.c        8.8 (Berkeley) 8/11/95
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: ufs_bmap.c,v 1.52 2017/03/18 05:33:06 riastradh Exp $");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/buf.h>
      #include <sys/proc.h>
      #include <sys/vnode.h>
      #include <sys/mount.h>
      #include <sys/resourcevar.h>
      #include <sys/trace.h>
      
      #include <miscfs/specfs/specdev.h>
      
      #include <ufs/ufs/inode.h>
      #include <ufs/ufs/ufsmount.h>
      #include <ufs/ufs/ufs_extern.h>
      #include <ufs/ufs/ufs_bswap.h>
      
      static bool
      ufs_issequential(const struct ufsmount *ump, daddr_t daddr0, daddr_t daddr1)
      {
      
               /* For ufs, blocks in a hole are not 'contiguous'. */
   71         if (daddr0 == 0)
                      return false;
      
   71         return (daddr0 + ump->um_seqinc == daddr1);
      }
      
      /*
       * Bmap converts the logical block number of a file to its physical block
       * number on the disk. The conversion is done by using the logical block
       * number to index into the array of block pointers described by the dinode.
       */
      int
      ufs_bmap(void *v)
      {
              struct vop_bmap_args /* {
                      struct vnode *a_vp;
                      daddr_t  a_bn;
                      struct vnode **a_vpp;
                      daddr_t *a_bnp;
                      int *a_runp;
              } */ *ap = v;
              int error;
      
              /*
               * Check for underlying vnode requests and ensure that logical
               * to physical mapping is requested.
               */
   71         if (ap->a_vpp != NULL)
   71                 *ap->a_vpp = VTOI(ap->a_vp)->i_devvp;
   71         if (ap->a_bnp == NULL)
                      return (0);
      
   71         error = ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL,
                  ap->a_runp, ufs_issequential);
              return error;
      }
      
      /*
       * Indirect blocks are now on the vnode for the file.  They are given negative
       * logical block numbers.  Indirect blocks are addressed by the negative
       * address of the first data block to which they point.  Double indirect blocks
       * are addressed by one less than the address of the first indirect block to
       * which they point.  Triple indirect blocks are addressed by one less than
       * the address of the first double indirect block to which they point.
       *
       * ufs_bmaparray does the bmap conversion, and if requested returns the
       * array of logical blocks which must be traversed to get to a block.
       * Each entry contains the offset into that block that gets you to the
       * next block and the disk address of the block (if it is assigned).
       */
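       /*
        * A concrete illustration, assuming UFS_NDADDR == 12 and
        * NINDIR(ump) == 2048 (values chosen only for the example): the single
        * indirect block covers file blocks 12..2059 and is addressed as
        * logical block -12; the first indirect block reached through the
        * double indirect block covers file blocks from 2060 on and is
        * addressed -2060, so the double indirect block itself is -2061; the
        * triple indirect block is likewise one less than the first double
        * indirect block it points to.
        */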
      
      int
      ufs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap,
          int *nump, int *runp, ufs_issequential_callback_t is_sequential)
      {
              struct inode *ip;
              struct buf *bp, *cbp;
              struct ufsmount *ump;
              struct mount *mp;
   71         struct indir a[UFS_NIADDR + 1], *xap;
              daddr_t daddr;
              daddr_t metalbn;
              int error, maxrun = 0, num;
      
              ip = VTOI(vp);
              mp = vp->v_mount;
              ump = ip->i_ump;
              KASSERTMSG(((ap == NULL) == (nump == NULL)),
                  "ufs_bmaparray: invalid arguments: ap = %p, nump = %p", ap, nump);
      
   71         if (runp) {
                      /*
                       * XXX
                       * If MAXBSIZE is the largest transfer the disks can handle,
                       * we probably want maxrun to be 1 block less so that we
                       * don't create a block larger than the device can handle.
                       */
   71                 *runp = 0;
                      maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1;
              }
      
   71         if (bn >= 0 && bn < UFS_NDADDR) {
   70                 if (nump != NULL)
                              *nump = 0;
   70                 if (ump->um_fstype == UFS1)
                              daddr = ufs_rw32(ip->i_ffs1_db[bn],
                                  UFS_MPNEEDSWAP(ump));
                      else
   70                         daddr = ufs_rw64(ip->i_ffs2_db[bn],
                                  UFS_MPNEEDSWAP(ump));
   70                 *bnp = blkptrtodb(ump, daddr);
                      /*
                       * Since this is FFS independent code, we are out of
                       * scope for the definitions of BLK_NOCOPY and
                       * BLK_SNAP, but we do know that they will fall in
                       * the range 1..um_seqinc, so we use that test and
                       * return a request for a zeroed out buffer if attempts
                       * are made to read a BLK_NOCOPY or BLK_SNAP block.
                       */
                      if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT
                          && daddr > 0 &&
                          daddr < ump->um_seqinc) {
    2                         *bnp = -1;
   70                 } else if (*bnp == 0) {
                              if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
                                  == SF_SNAPSHOT) {
                                      *bnp = blkptrtodb(ump, bn * ump->um_seqinc);
                              } else {
                                      *bnp = -1;
                              }
   70                 } else if (runp) {
   70                         if (ump->um_fstype == UFS1) {
                                      for (++bn; bn < UFS_NDADDR && *runp < maxrun &&
                                          is_sequential(ump,
                                              ufs_rw32(ip->i_ffs1_db[bn - 1],
                                                  UFS_MPNEEDSWAP(ump)),
                                              ufs_rw32(ip->i_ffs1_db[bn],
                                                  UFS_MPNEEDSWAP(ump)));
                                          ++bn, ++*runp);
                              } else {
   70                                 for (++bn; bn < UFS_NDADDR && *runp < maxrun &&
   70                                     is_sequential(ump,
                                              ufs_rw64(ip->i_ffs2_db[bn - 1],
                                                  UFS_MPNEEDSWAP(ump)),
                                              ufs_rw64(ip->i_ffs2_db[bn],
   70                                             UFS_MPNEEDSWAP(ump)));
   54                                     ++bn, ++*runp);
                              }
                      }
   71                 return (0);
              }
      
    8         xap = ap == NULL ? a : ap;
    8         if (!nump)
                      nump = &num;
    8         if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0)
                      return (error);
      
    8         num = *nump;
      
              /* Get disk address out of indirect block array */
              if (ump->um_fstype == UFS1)
                      daddr = ufs_rw32(ip->i_ffs1_ib[xap->in_off],
                          UFS_MPNEEDSWAP(ump));
              else
    8                 daddr = ufs_rw64(ip->i_ffs2_ib[xap->in_off],
                          UFS_MPNEEDSWAP(ump));
      
    8         for (bp = NULL, ++xap; --num; ++xap) {
                      /*
                       * Exit the loop if there is no disk address assigned yet and
                       * the indirect block isn't in the cache, or if we were
                       * looking for an indirect block and we've found it.
                       */
      
    8                 metalbn = xap->in_lbn;
                      if (metalbn == bn)
                              break;
    8                 if (daddr == 0) {
                              mutex_enter(&bufcache_lock);
                              cbp = incore(vp, metalbn);
                              mutex_exit(&bufcache_lock);
                              if (cbp == NULL)
                                      break;
                      }
      
                      /*
                       * If we get here, we've either got the block in the cache
                       * or we have a disk address for it, go fetch it.
                       */
    8                 if (bp)
                              brelse(bp, 0);
      
    8                 xap->in_exists = 1;
                      bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0);
                      if (bp == NULL) {
      
                              /*
                                * getblk() above returns NULL only if we are
                                * the pagedaemon.  See the implementation of
                                * getblk() for details.
                               */
      
                              return (ENOMEM);
                      }
    8                 if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
                              trace(TR_BREADHIT, pack(vp, size), metalbn);
                      } else {
                              KASSERTMSG((daddr != 0),
                                  "ufs_bmaparray: indirect block not in cache");
                              trace(TR_BREADMISS, pack(vp, size), metalbn);
                              bp->b_blkno = blkptrtodb(ump, daddr);
                              bp->b_flags |= B_READ;
                              BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
                              VOP_STRATEGY(vp, bp);
                              curlwp->l_ru.ru_inblock++;        /* XXX */
                              if ((error = biowait(bp)) != 0) {
                                      brelse(bp, 0);
                                      return (error);
                              }
                      }
    8                 if (ump->um_fstype == UFS1) {
    8                         daddr = ufs_rw32(((u_int32_t *)bp->b_data)[xap->in_off],
                                  UFS_MPNEEDSWAP(ump));
                              if (num == 1 && daddr && runp) {
                                      for (bn = xap->in_off + 1;
                                          bn < MNINDIR(ump) && *runp < maxrun &&
                                          is_sequential(ump,
                                              ufs_rw32(((int32_t *)bp->b_data)[bn-1],
                                                  UFS_MPNEEDSWAP(ump)),
                                              ufs_rw32(((int32_t *)bp->b_data)[bn],
                                                  UFS_MPNEEDSWAP(ump)));
                                          ++bn, ++*runp);
                              }
                      } else {
    8                         daddr = ufs_rw64(((u_int64_t *)bp->b_data)[xap->in_off],
                                  UFS_MPNEEDSWAP(ump));
    8                         if (num == 1 && daddr && runp) {
    8                                 for (bn = xap->in_off + 1;
    8                                     bn < MNINDIR(ump) && *runp < maxrun &&
    8                                     is_sequential(ump,
                                              ufs_rw64(((int64_t *)bp->b_data)[bn-1],
                                                  UFS_MPNEEDSWAP(ump)),
                                              ufs_rw64(((int64_t *)bp->b_data)[bn],
    8                                             UFS_MPNEEDSWAP(ump)));
    6                                     ++bn, ++*runp);
                              }
                      }
              }
              if (bp)
    8                 brelse(bp, 0);
      
              /*
               * Since this is FFS independent code, we are out of scope for the
               * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
               * will fall in the range 1..um_seqinc, so we use that test and
               * return a request for a zeroed out buffer if attempts are made
               * to read a BLK_NOCOPY or BLK_SNAP block.
               */
    8         if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT
                  && daddr > 0 && daddr < ump->um_seqinc) {
                      *bnp = -1;
                      return (0);
              }
    8         *bnp = blkptrtodb(ump, daddr);
              if (*bnp == 0) {
                      if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
                          == SF_SNAPSHOT) {
                              *bnp = blkptrtodb(ump, bn * ump->um_seqinc);
                      } else {
                              *bnp = -1;
                      }
              }
              return (0);
      }
      
      /*
       * Create an array of logical block number/offset pairs which represent the
       * path of indirect blocks required to access a data block.  The first "pair"
       * contains the logical block number of the appropriate single, double or
       * triple indirect block and the offset into the inode indirect block array.
       * Note, the logical block number of the inode single/double/triple indirect
       * block appears twice in the array, once with the offset into the i_ffs1_ib and
       * once with the offset into the page itself.
       */
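       /*
        * Worked example, again assuming UFS_NDADDR == 12 and
        * NINDIR(ump) == 2048 purely for illustration: for file block 2100,
        * which lives under the double indirect block, *nump comes back as 3
        * and the array contains
        *
        *      { in_lbn = -2061, in_off =  1 } double indirect slot in the inode
        *      { in_lbn = -2061, in_off =  0 } offset within the double indirect
        *      { in_lbn = -2060, in_off = 40 } offset within the single indirect
        *
        * which shows the duplicated metadata block number mentioned above.
        */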
      int
      ufs_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump)
      {
              daddr_t metalbn, realbn;
              struct ufsmount *ump;
              int64_t blockcnt;
              int lbc;
              int i, numlevels, off;
      
   10         ump = VFSTOUFS(vp->v_mount);
              if (nump)
   10                 *nump = 0;
              numlevels = 0;
              realbn = bn;
              if (bn < 0)
                      bn = -bn;
              KASSERT(bn >= UFS_NDADDR);
      
              /*
               * Determine the number of levels of indirection.  After this loop
               * is done, blockcnt indicates the number of data blocks possible
               * at the given level of indirection, and UFS_NIADDR - i is the number
               * of levels of indirection needed to locate the requested block.
               */
      
   10         bn -= UFS_NDADDR;
              for (lbc = 0, i = UFS_NIADDR;; i--, bn -= blockcnt) {
                      if (i == 0)
   10                         return (EFBIG);
      
   10                 lbc += ump->um_lognindir;
                      blockcnt = (int64_t)1 << lbc;
      
                      if (bn < blockcnt)
                              break;
              }
      
              /* Calculate the address of the first meta-block. */
   10         metalbn = -((realbn >= 0 ? realbn : -realbn) - bn + UFS_NIADDR - i);
      
              /*
               * At each iteration, off is the offset into the bap array which is
               * an array of disk addresses at the current level of indirection.
               * The logical block number and the offset in that block are stored
               * into the argument array.
               */
              ap->in_lbn = metalbn;
              ap->in_off = off = UFS_NIADDR - i;
              ap->in_exists = 0;
              ap++;
              for (++numlevels; i <= UFS_NIADDR; i++) {
                      /* If searching for a meta-data block, quit when found. */
                      if (metalbn == realbn)
                              break;
      
   10                 lbc -= ump->um_lognindir;
                      off = (bn >> lbc) & (MNINDIR(ump) - 1);
      
                      ++numlevels;
                      ap->in_lbn = metalbn;
                      ap->in_off = off;
                      ap->in_exists = 0;
                      ++ap;
      
                      metalbn -= -1 + ((int64_t)off << lbc);
              }
   10         if (nump)
   10                 *nump = numlevels;
              return (0);
      }
      /*        $NetBSD: kern_rwlock.c,v 1.54 2019/05/09 05:00:31 ozaki-r Exp $        */
      
      /*-
       * Copyright (c) 2002, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Jason R. Thorpe and Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Kernel reader/writer lock implementation, modeled after those
       * found in Solaris, a description of which can be found in:
       *
       *        Solaris Internals: Core Kernel Architecture, Jim Mauro and
       *            Richard McDougall.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.54 2019/05/09 05:00:31 ozaki-r Exp $");
      
      #define        __RWLOCK_PRIVATE
      
      #include <sys/param.h>
      #include <sys/proc.h>
      #include <sys/rwlock.h>
      #include <sys/sched.h>
      #include <sys/sleepq.h>
      #include <sys/systm.h>
      #include <sys/lockdebug.h>
      #include <sys/cpu.h>
      #include <sys/atomic.h>
      #include <sys/lock.h>
      #include <sys/pserialize.h>
      
      #include <dev/lockstat.h>
      
      /*
       * LOCKDEBUG
       */
      
      #if defined(LOCKDEBUG)
      
      #define        RW_WANTLOCK(rw, op)                                                \
              LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw),                        \
                  (uintptr_t)__builtin_return_address(0), op == RW_READER);
      #define        RW_LOCKED(rw, op)                                                \
              LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL,                        \
                  (uintptr_t)__builtin_return_address(0), op == RW_READER);
      #define        RW_UNLOCKED(rw, op)                                                \
              LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw),                        \
                  (uintptr_t)__builtin_return_address(0), op == RW_READER);
      #define        RW_DASSERT(rw, cond)                                                \
      do {                                                                        \
              if (__predict_false(!(cond)))                                        \
                      rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
      } while (/* CONSTCOND */ 0);
      
      #else        /* LOCKDEBUG */
      
      #define        RW_WANTLOCK(rw, op)        /* nothing */
      #define        RW_LOCKED(rw, op)        /* nothing */
      #define        RW_UNLOCKED(rw, op)        /* nothing */
      #define        RW_DASSERT(rw, cond)        /* nothing */
      
      #endif        /* LOCKDEBUG */
      
      /*
       * DIAGNOSTIC
       */
      
      #if defined(DIAGNOSTIC)
      
      #define        RW_ASSERT(rw, cond)                                                \
      do {                                                                        \
              if (__predict_false(!(cond)))                                        \
                      rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
      } while (/* CONSTCOND */ 0)
      
      #else
      
      #define        RW_ASSERT(rw, cond)        /* nothing */
      
      #endif        /* DIAGNOSTIC */
      
      #define        RW_SETDEBUG(rw, on)                ((rw)->rw_owner |= (on) ? 0 : RW_NODEBUG)
      #define        RW_DEBUG_P(rw)                        (((rw)->rw_owner & RW_NODEBUG) == 0)
      #if defined(LOCKDEBUG)
      #define        RW_INHERITDEBUG(n, o)                (n) |= (o) & RW_NODEBUG
      #else /* defined(LOCKDEBUG) */
      #define        RW_INHERITDEBUG(n, o)                /* nothing */
      #endif /* defined(LOCKDEBUG) */
      
      static void        rw_abort(const char *, size_t, krwlock_t *, const char *);
      static void        rw_dump(const volatile void *, lockop_printer_t);
      static lwp_t        *rw_owner(wchan_t);
      
      static inline uintptr_t
      rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
      {
      
  748         RW_INHERITDEBUG(n, o);
              return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
                  (void *)o, (void *)n);
      }
      
      static inline void
      rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
      {
      
   82         RW_INHERITDEBUG(n, o);
              n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
                  (void *)n);
              RW_DASSERT(rw, n == o);
      }
      
      /*
       * For platforms that do not provide stubs, or for the LOCKDEBUG case.
       */
      #ifdef LOCKDEBUG
      #undef        __HAVE_RW_STUBS
      #endif
      
      #ifndef __HAVE_RW_STUBS
      __strong_alias(rw_enter,rw_vector_enter);
      __strong_alias(rw_exit,rw_vector_exit);
      __strong_alias(rw_tryenter,rw_vector_tryenter);
      #endif
      
      lockops_t rwlock_lockops = {
              .lo_name = "Reader / writer lock",
              .lo_type = LOCKOPS_SLEEP,
              .lo_dump = rw_dump,
      };
      
      syncobj_t rw_syncobj = {
              .sobj_flag        = SOBJ_SLEEPQ_SORTED,
              .sobj_unsleep        = turnstile_unsleep,
              .sobj_changepri        = turnstile_changepri,
              .sobj_lendpri        = sleepq_lendpri,
              .sobj_owner        = rw_owner,
      };
      
      /*
       * rw_dump:
       *
       *        Dump the contents of a rwlock structure.
       */
      static void
      rw_dump(const volatile void *cookie, lockop_printer_t pr)
      {
              const volatile krwlock_t *rw = cookie;
      
              pr("owner/count  : %#018lx flags    : %#018x\n",
                  (long)RW_OWNER(rw), (int)RW_FLAGS(rw));
      }
      
      /*
       * rw_abort:
       *
       *        Dump information about an error and panic the system.  This
       *        generates a lot of machine code in the DIAGNOSTIC case, so
       *        we ask the compiler to not inline it.
       */
      static void __noinline
      rw_abort(const char *func, size_t line, krwlock_t *rw, const char *msg)
      {
      
              if (panicstr != NULL)
                      return;
      
              LOCKDEBUG_ABORT(func, line, rw, &rwlock_lockops, msg);
      }
      
      /*
       * rw_init:
       *
       *        Initialize a rwlock for use.
       */
      void _rw_init(krwlock_t *, uintptr_t);
      void
      _rw_init(krwlock_t *rw, uintptr_t return_address)
      {
              bool dodebug;
      
  171         memset(rw, 0, sizeof(*rw));
      
              dodebug = LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address);
  171         RW_SETDEBUG(rw, dodebug);
      }
      
      void
      rw_init(krwlock_t *rw)
      {
      
  171         _rw_init(rw, (uintptr_t)__builtin_return_address(0));
      }
      
      /*
       * rw_destroy:
       *
       *        Tear down a rwlock.
       */
      void
      rw_destroy(krwlock_t *rw)
      {
      
   26         RW_ASSERT(rw, (rw->rw_owner & ~RW_NODEBUG) == 0);
   26         LOCKDEBUG_FREE(RW_DEBUG_P(rw), rw);
      }
      
      /*
       * rw_oncpu:
       *
       *        Return true if an rwlock owner is running on a CPU in the system.
       *        If the target is waiting on the kernel big lock, then we must
       *        release it.  This is necessary to avoid deadlock.
       */
      static bool
      rw_oncpu(uintptr_t owner)
      {
      #ifdef MULTIPROCESSOR
              struct cpu_info *ci;
              lwp_t *l;
      
  163         KASSERT(kpreempt_disabled());
      
  163         if ((owner & (RW_WRITE_LOCKED|RW_HAS_WAITERS)) != RW_WRITE_LOCKED) {
  163                 return false;
              }
      
              /*
               * See lwp_dtor() why dereference of the LWP pointer is safe.
               * We must have kernel preemption disabled for that.
               */
  109         l = (lwp_t *)(owner & RW_THREAD);
              ci = l->l_cpu;
      
  109         if (ci && ci->ci_curlwp == l) {
                      /* Target is running; do we need to block? */
   32                 return (ci->ci_biglock_wanted != l);
              }
      #endif
              /* Not running.  It may be safe to block now. */
              return false;
      }
      
      /*
       * rw_vector_enter:
       *
       *        Acquire a rwlock.
       */
      void
      rw_vector_enter(krwlock_t *rw, const krw_t op)
      {
              uintptr_t owner, incr, need_wait, set_wait, curthread, next;
              turnstile_t *ts;
              int queue;
              lwp_t *l;
              LOCKSTAT_TIMER(slptime);
              LOCKSTAT_TIMER(slpcnt);
              LOCKSTAT_TIMER(spintime);
              LOCKSTAT_COUNTER(spincnt);
              LOCKSTAT_FLAG(lsflag);
      
  724         l = curlwp;
              curthread = (uintptr_t)l;
      
              RW_ASSERT(rw, !cpu_intr_p());
  724         RW_ASSERT(rw, curthread != 0);
  724         RW_WANTLOCK(rw, op);
      
  724         if (panicstr == NULL) {
  724                 KDASSERT(pserialize_not_in_read_section());
  724                 LOCKDEBUG_BARRIER(&kernel_lock, 1);
              }
      
              /*
                * We play a slight trick here.  If we're a reader, we want to
                * increment the read count.  If we're a writer, we want to
               * set the owner field and the WRITE_LOCKED bit.
               *
               * In the latter case, we expect those bits to be zero,
               * therefore we can use an add operation to set them, which
               * means an add operation for both cases.
               */
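               /*
                * Concretely (illustration only, no additional logic):
                *
                *      reader: new = owner + RW_READ_INCR        (count += 1)
                *      writer: new = owner + (curthread | RW_WRITE_LOCKED)
                *
                * where, for the writer, the owner and WRITE_LOCKED bits of
                * the old value are known to be zero, so the add simply
                * stores them.
                */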
  724         if (__predict_true(op == RW_READER)) {
                      incr = RW_READ_INCR;
                      set_wait = RW_HAS_WAITERS;
                      need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
                      queue = TS_READER_Q;
              } else {
  607                 RW_DASSERT(rw, op == RW_WRITER);
  607                 incr = curthread | RW_WRITE_LOCKED;
                      set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
                      need_wait = RW_WRITE_LOCKED | RW_THREAD;
                      queue = TS_WRITER_Q;
              }
      
  724         LOCKSTAT_ENTER(lsflag);
      
  724         KPREEMPT_DISABLE(curlwp);
              for (owner = rw->rw_owner; ;) {
                      /*
                       * Read the lock owner field.  If the need-to-wait
                       * indicator is clear, then try to acquire the lock.
                       */
  724                 if ((owner & need_wait) == 0) {
  724                         next = rw_cas(rw, owner, (owner + incr) &
                                  ~RW_WRITE_WANTED);
                              if (__predict_true(next == owner)) {
                                      /* Got it! */
  724                                 membar_enter();
                                      break;
                              }
      
                              /*
                               * Didn't get it -- spin around again (we'll
                               * probably sleep on the next iteration).
                               */
                              owner = next;
                              continue;
                      }
  163                 if (__predict_false(panicstr != NULL)) {
                              KPREEMPT_ENABLE(curlwp);
                              return;
                      }
  163                 if (__predict_false(RW_OWNER(rw) == curthread)) {
                              rw_abort(__func__, __LINE__, rw,
                                  "locking against myself");
                      }
                      /*
                       * If the lock owner is running on another CPU, and
                       * there are no existing waiters, then spin.
                       */
  163                 if (rw_oncpu(owner)) {
   32                         LOCKSTAT_START_TIMER(lsflag, spintime);
                              u_int count = SPINLOCK_BACKOFF_MIN;
                              do {
   32                                 KPREEMPT_ENABLE(curlwp);
   32                                 SPINLOCK_BACKOFF(count);
   32                                 KPREEMPT_DISABLE(curlwp);
                                      owner = rw->rw_owner;
                              } while (rw_oncpu(owner));
   32                         LOCKSTAT_STOP_TIMER(lsflag, spintime);
   32                         LOCKSTAT_COUNT(spincnt, 1);
                              if ((owner & need_wait) == 0)
                                      continue;
                      }
      
                      /*
                       * Grab the turnstile chain lock.  Once we have that, we
                       * can adjust the waiter bits and sleep queue.
                       */
  163                 ts = turnstile_lookup(rw);
      
                      /*
                       * Mark the rwlock as having waiters.  If the set fails,
                       * then we may not need to sleep and should spin again.
                       * Reload rw_owner because turnstile_lookup() may have
                       * spun on the turnstile chain lock.
                       */
                      owner = rw->rw_owner;
  162                 if ((owner & need_wait) == 0 || rw_oncpu(owner)) {
    5                         turnstile_exit(rw);
                              continue;
                      }
  162                 next = rw_cas(rw, owner, owner | set_wait);
                      if (__predict_false(next != owner)) {
                              turnstile_exit(rw);
                              owner = next;
                              continue;
                      }
      
  162                 LOCKSTAT_START_TIMER(lsflag, slptime);
  162                 turnstile_block(ts, queue, rw, &rw_syncobj);
                      LOCKSTAT_STOP_TIMER(lsflag, slptime);
  162                 LOCKSTAT_COUNT(slpcnt, 1);
      
                      /*
                       * No need for a memory barrier because of context switch.
                       * If not handed the lock, then spin again.
                       */
  149                 if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
                              break;
      
  146                 owner = rw->rw_owner;
              }
  724         KPREEMPT_ENABLE(curlwp);
      
  724         LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK |
                  (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime);
              LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime);
              LOCKSTAT_EXIT(lsflag);
      
  724         RW_DASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
                  (op == RW_READER && RW_COUNT(rw) != 0));
  724         RW_LOCKED(rw, op);
      }
      
      /*
       * rw_vector_exit:
       *
       *        Release a rwlock.
       */
      void
      rw_vector_exit(krwlock_t *rw)
      {
              uintptr_t curthread, owner, decr, newown, next;
              turnstile_t *ts;
              int rcnt, wcnt;
              lwp_t *l;
      
  749         curthread = (uintptr_t)curlwp;
              RW_ASSERT(rw, curthread != 0);
      
  749         if (__predict_false(panicstr != NULL))
                      return;
      
              /*
               * Again, we use a trick.  Since we used an add operation to
               * set the required lock bits, we can use a subtract to clear
               * them, which makes the read-release and write-release path
               * the same.
               */
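               /*
                * Spelled out (illustration only):
                *
                *      reader release: new = owner - RW_READ_INCR
                *      writer release: new = owner - (curthread | RW_WRITE_LOCKED)
                *
                * In both cases the remaining bits tell us below whether the
                * lock would become unowned while waiters are still queued.
                */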
  749         owner = rw->rw_owner;
              if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
  633                 RW_UNLOCKED(rw, RW_WRITER);
                      RW_ASSERT(rw, RW_OWNER(rw) == curthread);
  633                 decr = curthread | RW_WRITE_LOCKED;
              } else {
  652                 RW_UNLOCKED(rw, RW_READER);
  652                 RW_ASSERT(rw, RW_COUNT(rw) != 0);
                      decr = RW_READ_INCR;
              }
      
              /*
               * Compute what we expect the new value of the lock to be. Only
               * proceed to do direct handoff if there are waiters, and if the
               * lock would become unowned.
               */
  749         membar_exit();
              for (;;) {
  749                 newown = (owner - decr);
                      if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
                              break;
  748                 next = rw_cas(rw, owner, newown);
                      if (__predict_true(next == owner))
                              return;
                      owner = next;
              }
      
              /*
               * Grab the turnstile chain lock.  This gets the interlock
               * on the sleep queue.  Once we have that, we can adjust the
               * waiter bits.
               */
  101         ts = turnstile_lookup(rw);
              owner = rw->rw_owner;
              RW_DASSERT(rw, ts != NULL);
  101         RW_DASSERT(rw, (owner & RW_HAS_WAITERS) != 0);
      
  101         wcnt = TS_WAITERS(ts, TS_WRITER_Q);
              rcnt = TS_WAITERS(ts, TS_READER_Q);
      
              /*
               * Give the lock away.
               *
               * If we are releasing a write lock, then prefer to wake all
               * outstanding readers.  Otherwise, wake one writer if there
               * are outstanding readers, or all writers if there are no
               * pending readers.  If waking one specific writer, the writer
               * is handed the lock here.  If waking multiple writers, we
               * set WRITE_WANTED to block out new readers, and let them
               * do the work of acquiring the lock in rw_vector_enter().
               */
   49         if (rcnt == 0 || decr == RW_READ_INCR) {
   82                 RW_DASSERT(rw, wcnt != 0);
   82                 RW_DASSERT(rw, (owner & RW_WRITE_WANTED) != 0);
      
                      if (rcnt != 0) {
                              /* Give the lock to the longest waiting writer. */
    4                         l = TS_FIRST(ts, TS_WRITER_Q);
    4                         newown = (uintptr_t)l | RW_WRITE_LOCKED | RW_HAS_WAITERS;
                              if (wcnt > 1)
                                      newown |= RW_WRITE_WANTED;
    4                         rw_swap(rw, owner, newown);
    4                         turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
                      } else {
                              /* Wake all writers and let them fight it out. */
   81                         rw_swap(rw, owner, RW_WRITE_WANTED);
   81                         turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
                      }
              } else {
                      RW_DASSERT(rw, rcnt != 0);
      
                      /*
                       * Give the lock to all blocked readers.  If there
                       * is a writer waiting, new readers that arrive
                       * after the release will be blocked out.
                       */
   49                 newown = rcnt << RW_READ_COUNT_SHIFT;
   49                 if (wcnt != 0)
                              newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
                              
                      /* Wake up all sleeping readers. */
                      rw_swap(rw, owner, newown);
   49                 turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
              }
      }
      
      /*
       * rw_vector_tryenter:
       *
       *        Try to acquire a rwlock.
       */
      int
      rw_vector_tryenter(krwlock_t *rw, const krw_t op)
      {
              uintptr_t curthread, owner, incr, need_wait, next;
      
  360         curthread = (uintptr_t)curlwp;
      
              RW_ASSERT(rw, curthread != 0);
      
  360         if (op == RW_READER) {
                      incr = RW_READ_INCR;
                      need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
              } else {
  325                 RW_DASSERT(rw, op == RW_WRITER);
  325                 incr = curthread | RW_WRITE_LOCKED;
                      need_wait = RW_WRITE_LOCKED | RW_THREAD;
              }
      
  360         for (owner = rw->rw_owner;; owner = next) {
                      if (__predict_false((owner & need_wait) != 0))
                              return 0;
  360                 next = rw_cas(rw, owner, owner + incr);
                      if (__predict_true(next == owner)) {
                              /* Got it! */
  360                         membar_enter();
                              break;
                      }
              }
      
  360         RW_WANTLOCK(rw, op);
  360         RW_LOCKED(rw, op);
  360         RW_DASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
                  (op == RW_READER && RW_COUNT(rw) != 0));
      
  360         return 1;
      }
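       /*
        * Illustrative caller pattern; the softc and lock names are
        * hypothetical:
        *
        *      if (rw_tryenter(&sc->sc_lock, RW_WRITER)) {
        *              ... update state without blocking ...
        *              rw_exit(&sc->sc_lock);
        *      } else {
        *              ... defer the work, or fall back to rw_enter() ...
        *      }
        */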
      
      /*
       * rw_downgrade:
       *
       *        Downgrade a write lock to a read lock.
       */
      void
      rw_downgrade(krwlock_t *rw)
      {
              uintptr_t owner, curthread, newown, next;
              turnstile_t *ts;
              int rcnt, wcnt;
      
    5         curthread = (uintptr_t)curlwp;
              RW_ASSERT(rw, curthread != 0);
    5         RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
    5         RW_ASSERT(rw, RW_OWNER(rw) == curthread);
    5         RW_UNLOCKED(rw, RW_WRITER);
      #if !defined(DIAGNOSTIC)
              __USE(curthread);
      #endif
      
      
    5         membar_producer();
              owner = rw->rw_owner;
              if ((owner & RW_HAS_WAITERS) == 0) {
                      /*
                       * There are no waiters, so we can do this the easy way.
                       * Try swapping us down to one read hold.  If it fails, the
                       * lock condition has changed and we most likely now have
                       * waiters.
                       */
    5                 next = rw_cas(rw, owner, RW_READ_INCR);
                      if (__predict_true(next == owner)) {
    5                         RW_LOCKED(rw, RW_READER);
    5                         RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
    5                         RW_DASSERT(rw, RW_COUNT(rw) != 0);
                              return;
                      }
                      owner = next;
              }
      
              /*
               * Grab the turnstile chain lock.  This gets the interlock
               * on the sleep queue.  Once we have that, we can adjust the
               * waiter bits.
               */
              for (;; owner = next) {
                      ts = turnstile_lookup(rw);
                      RW_DASSERT(rw, ts != NULL);
      
                      rcnt = TS_WAITERS(ts, TS_READER_Q);
                      wcnt = TS_WAITERS(ts, TS_WRITER_Q);
      
                      /*
                       * If there are no readers, just preserve the waiters
                       * bits, swap us down to one read hold and return.
                       */
                      if (rcnt == 0) {
                              RW_DASSERT(rw, wcnt != 0);
                              RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
                              RW_DASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);
      
                              newown = RW_READ_INCR | RW_HAS_WAITERS | RW_WRITE_WANTED;
                              next = rw_cas(rw, owner, newown);
                              turnstile_exit(rw);
                              if (__predict_true(next == owner))
                                      break;
                      } else {
                              /*
                               * Give the lock to all blocked readers.  We may
                               * retain one read hold if downgrading.  If there
                               * is a writer waiting, new readers will be blocked
                               * out.
                               */
                              newown = (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
                              if (wcnt != 0)
                                      newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
      
                              next = rw_cas(rw, owner, newown);
                              if (__predict_true(next == owner)) {
                                      /* Wake up all sleeping readers. */
                                      turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
                                      break;
                              }
                              turnstile_exit(rw);
                      }
              }
      
              RW_WANTLOCK(rw, RW_READER);
              RW_LOCKED(rw, RW_READER);
              RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
              RW_DASSERT(rw, RW_COUNT(rw) != 0);
      }
      
      /*
       * rw_tryupgrade:
       *
       *        Try to upgrade a read lock to a write lock.  We must be the
       *        only reader.
       */
      int
      rw_tryupgrade(krwlock_t *rw)
      {
              uintptr_t owner, curthread, newown, next;
      
	curthread = (uintptr_t)curlwp;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, rw_read_held(rw));

	for (owner = rw->rw_owner;; owner = next) {
		RW_ASSERT(rw, (owner & RW_WRITE_LOCKED) == 0);
		if (__predict_false((owner & RW_THREAD) != RW_READ_INCR)) {
			RW_ASSERT(rw, (owner & RW_THREAD) != 0);
			return 0;
		}
		newown = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner)) {
			membar_producer();
			break;
		}
	}

	RW_UNLOCKED(rw, RW_READER);
	RW_WANTLOCK(rw, RW_WRITER);
	RW_LOCKED(rw, RW_WRITER);
	RW_DASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
	RW_DASSERT(rw, RW_OWNER(rw) == curthread);
      
              return 1;
      }
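
/*
 * Illustrative sketch (not part of the original source): a caller that
 * holds a read lock, tries an opportunistic upgrade with
 * rw_tryupgrade(), and falls back to dropping the lock and re-taking it
 * as a writer when other readers are present.  example_update() and its
 * argument are hypothetical.
 */
#if 0
static void
example_update(krwlock_t *lock)
{

	rw_enter(lock, RW_READER);
	/* ... decide that a modification is required ... */
	if (!rw_tryupgrade(lock)) {
		/*
		 * Other readers hold the lock: drop it, retake it as a
		 * writer, and re-validate whatever was read.
		 */
		rw_exit(lock);
		rw_enter(lock, RW_WRITER);
	}
	/* ... modify the protected data under the write lock ... */
	rw_exit(lock);
}
#endif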
      
      /*
       * rw_read_held:
       *
       *        Returns true if the rwlock is held for reading.  Must only be
       *        used for diagnostic assertions, and never be used to make
       *         decisions about how to use a rwlock.
       */
      int
      rw_read_held(krwlock_t *rw)
      {
              uintptr_t owner;
      
	if (panicstr != NULL)
		return 1;
	if (rw == NULL)
		return 0;
	owner = rw->rw_owner;
	return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
      }
      
      /*
       * rw_write_held:
       *
       *        Returns true if the rwlock is held for writing.  Must only be
       *        used for diagnostic assertions, and never be used to make
       *        decisions about how to use a rwlock.
       */
      int
      rw_write_held(krwlock_t *rw)
      {
      
	if (panicstr != NULL)
		return 1;
	if (rw == NULL)
		return 0;
	return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
                  (RW_WRITE_LOCKED | (uintptr_t)curlwp);
      }
      
      /*
       * rw_lock_held:
       *
       *        Returns true if the rwlock is held for reading or writing.  Must
       *        only be used for diagnostic assertions, and never be used to make
       *        decisions about how to use a rwlock.
       */
      int
      rw_lock_held(krwlock_t *rw)
      {
      
	if (panicstr != NULL)
		return 1;
	if (rw == NULL)
		return 0;
	return (rw->rw_owner & RW_THREAD) != 0;
      }
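
/*
 * Illustrative sketch (not part of the original source): the held-checks
 * above are meant for assertions only, as in the hypothetical routine
 * below, never for run-time locking decisions.
 */
#if 0
static void
example_remove(krwlock_t *lock)
{

	KASSERT(rw_write_held(lock));
	/* ... perform a modification that requires the write lock ... */
}
#endif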
      
      /*
       * rw_owner:
       *
       *        Return the current owner of an RW lock, but only if it is write
       *        held.  Used for priority inheritance.
       */
      static lwp_t *
      rw_owner(wchan_t obj)
      {
              krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
	uintptr_t owner = rw->rw_owner;

	if ((owner & RW_WRITE_LOCKED) == 0)
		return NULL;

	return (void *)(owner & RW_THREAD);
      }
      /*        $NetBSD: kern_syscall.c,v 1.19 2019/10/06 15:11:17 uwe Exp $        */
      
      /*-
       * Copyright (c) 2008 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software developed for The NetBSD Foundation
       * by Andrew Doran.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_syscall.c,v 1.19 2019/10/06 15:11:17 uwe Exp $");
      
      #ifdef _KERNEL_OPT
      #include "opt_modular.h"
      #include "opt_syscall_debug.h"
      #include "opt_ktrace.h"
      #include "opt_ptrace.h"
      #include "opt_dtrace.h"
      #endif
      
      /* XXX To get syscall prototypes. */
      #define SYSVSHM
      #define SYSVSEM
      #define SYSVMSG
      
      #include <sys/param.h>
      #include <sys/module.h>
      #include <sys/sched.h>
      #include <sys/syscall.h>
      #include <sys/syscallargs.h>
      #include <sys/syscallvar.h>
      #include <sys/systm.h>
      #include <sys/xcall.h>
      #include <sys/ktrace.h>
      #include <sys/ptrace.h>
      
      int
      sys_nomodule(struct lwp *l, const void *v, register_t *retval)
      {
      #ifdef MODULAR
      
              const struct sysent *sy;
              const struct emul *em;
              const struct sc_autoload *auto_list;
              u_int code;
      
              /*
               * Restart the syscall if we interrupted a module unload that
               * failed.  Acquiring kernconfig_lock delays us until any unload
               * has been completed or rolled back.
               */
              kernconfig_lock();
              sy = l->l_sysent;
              if (sy->sy_call != sys_nomodule) {
                      kernconfig_unlock();
                      return ERESTART;
              }
              /*
               * Try to autoload a module to satisfy the request.  If it 
               * works, retry the request.
               */
              em = l->l_proc->p_emul;
              code = sy - em->e_sysent;
      
              if ((auto_list = em->e_sc_autoload) != NULL)
                      for (; auto_list->al_code > 0; auto_list++) {
                              if (auto_list->al_code != code) {
                                      continue;
                              }
                              if (module_autoload(auto_list->al_module,
                                  MODULE_CLASS_ANY) != 0 ||
                                  sy->sy_call == sys_nomodule) {
                                          break;
                              }
                              kernconfig_unlock();
                              return ERESTART;
                      }
              kernconfig_unlock();
      #endif        /* MODULAR */
      
              return sys_nosys(l, v, retval);
      }
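
/*
 * Illustrative sketch (not part of the original source): an emulation's
 * e_sc_autoload table maps system call numbers to the modules that
 * provide them, terminated by an entry with al_code == 0; the loop
 * above walks such a table.  The specific pairings below are
 * hypothetical examples.
 */
#if 0
static const struct sc_autoload example_syscalls_autoload[] = {
	{ SYS_mq_open,	"mqueue" },	/* hypothetical mapping */
	{ SYS_aio_read,	"aio" },	/* hypothetical mapping */
	{ 0, NULL },
};
#endif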
      
      int
      syscall_establish(const struct emul *em, const struct syscall_package *sp)
      {
              struct sysent *sy;
              int i;
      
              KASSERT(kernconfig_is_held());
      
              if (em == NULL) {
                      em = &emul_netbsd;
              }
              sy = em->e_sysent;
      
              /*
               * Ensure that all preconditions are valid, since this is
               * an all or nothing deal.  Once a system call is entered,
               * it can become busy and we could be unable to remove it
               * on error.
               */
              for (i = 0; sp[i].sp_call != NULL; i++) {
                      if (sp[i].sp_code >= SYS_NSYSENT)
                              return EINVAL;
                      if (sy[sp[i].sp_code].sy_call != sys_nomodule &&
                          sy[sp[i].sp_code].sy_call != sys_nosys) {
      #ifdef DIAGNOSTIC
                              printf("syscall %d is busy\n", sp[i].sp_code);
      #endif
                              return EBUSY;
                      }
              }
              /* Everything looks good, patch them in. */
              for (i = 0; sp[i].sp_call != NULL; i++) {
                      sy[sp[i].sp_code].sy_call = sp[i].sp_call;
              }
      
              return 0;
      }
      
      int
      syscall_disestablish(const struct emul *em, const struct syscall_package *sp)
      {
              struct sysent *sy;
              const uint32_t *sb;
              lwp_t *l;
              int i;
      
              KASSERT(kernconfig_is_held());
      
              if (em == NULL) {
                      em = &emul_netbsd;
              }
              sy = em->e_sysent;
              sb = em->e_nomodbits;
      
              /*
               * First, patch the system calls to sys_nomodule or sys_nosys
               * to gate further activity.
               */
              for (i = 0; sp[i].sp_call != NULL; i++) {
                      KASSERT(sy[sp[i].sp_code].sy_call == sp[i].sp_call);
                      sy[sp[i].sp_code].sy_call =
                          sb[sp[i].sp_code / 32] & (1 << (sp[i].sp_code % 32)) ?
                            sys_nomodule : sys_nosys;
              }
      
              /*
               * Run a cross call to cycle through all CPUs.  This does two
               * things: lock activity provides a barrier and makes our update
               * of sy_call visible to all CPUs, and upon return we can be sure
               * that we see pertinent values of l_sysent posted by remote CPUs.
               */
              xc_barrier(0);
      
              /*
               * Now it's safe to check l_sysent.  Run through all LWPs and see
               * if anyone is still using the system call.
               */
              for (i = 0; sp[i].sp_call != NULL; i++) {
                      mutex_enter(proc_lock);
                      LIST_FOREACH(l, &alllwp, l_list) {
                              if (l->l_sysent == &sy[sp[i].sp_code]) {
                                      break;
                              }
                      }
                      mutex_exit(proc_lock);
                      if (l == NULL) {
                              continue;
                      }
                      /*
                       * We lose: one or more calls are still in use.  Put back
                       * the old entrypoints and act like nothing happened.
                       * When we drop kernconfig_lock, any system calls held in
                       * sys_nomodule() will be restarted.
                       */
                      for (i = 0; sp[i].sp_call != NULL; i++) {
                              sy[sp[i].sp_code].sy_call = sp[i].sp_call;
                      }
                      return EBUSY;
              }
      
              return 0;
      }
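
/*
 * Illustrative sketch (not part of the original source): a loadable
 * module typically declares a NULL-terminated syscall_package array and
 * passes it to syscall_establish()/syscall_disestablish() from its
 * modcmd routine.  SYS_example and sys_example below are hypothetical.
 */
#if 0
static const struct syscall_package example_syscalls[] = {
	{ SYS_example, 0, (sy_call_t *)sys_example },
	{ 0, 0, NULL },
};

static int
example_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		return syscall_establish(NULL, example_syscalls);
	case MODULE_CMD_FINI:
		return syscall_disestablish(NULL, example_syscalls);
	default:
		return ENOTTY;
	}
}
#endif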
      
      /*
       * Return true if system call tracing is enabled for the specified process.
       */
      bool
      trace_is_enabled(struct proc *p)
      {
      #ifdef SYSCALL_DEBUG
              return (true);
      #endif
      #ifdef KTRACE
	if (ISSET(p->p_traceflag, (KTRFAC_SYSCALL | KTRFAC_SYSRET)))
		return (true);
#endif
#ifdef PTRACE
	if (ISSET(p->p_slflag, PSL_SYSCALL))
                      return (true);
      #endif
      
              return (false);
      }
      
      /*
 * Start a trace of a particular system call.  If the process is being
 * traced, this routine is called by the MD syscall dispatch code just
 * before a system call is actually executed.
       */
      int
      trace_enter(register_t code, const struct sysent *sy, const void *args)
      {
              int error = 0;
      
      #ifdef KDTRACE_HOOKS
              if (sy->sy_entry) {
                      struct emul *e = curlwp->l_proc->p_emul;
                      (*e->e_dtrace_syscall)(sy->sy_entry, code, sy, args, NULL, 0);
              }
      #endif
      
      #ifdef SYSCALL_DEBUG
              scdebug_call(code, args);
      #endif /* SYSCALL_DEBUG */
      
              ktrsyscall(code, args, sy->sy_narg);
      
      #ifdef PTRACE
              if ((curlwp->l_proc->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
                  (PSL_SYSCALL|PSL_TRACED)) {
                      proc_stoptrace(TRAP_SCE, code, args, NULL, 0);
                      if (curlwp->l_proc->p_slflag & PSL_SYSCALLEMU) {
                              /* tracer will emulate syscall for us */
                              error = EJUSTRETURN;
                      }
              }
      #endif
              return error;
      }
      
      /*
 * End a trace of a particular system call.  If the process is being
 * traced, this routine is called by the MD syscall dispatch code just
 * after a system call finishes.
 * The MD caller guarantees that the passed 'code' is within the
 * supported system call number range for the emulation the process
 * runs under.
       */
      void
      trace_exit(register_t code, const struct sysent *sy, const void *args,
          register_t rval[], int error)
      {
      #if defined(PTRACE) || defined(KDTRACE_HOOKS)
              struct proc *p = curlwp->l_proc;
      #endif
      
      #ifdef KDTRACE_HOOKS
              if (sy->sy_return) {
                      (*p->p_emul->e_dtrace_syscall)(sy->sy_return, code, sy, args,
                          rval, error);
              }
      #endif
      
      #ifdef SYSCALL_DEBUG
              scdebug_ret(code, error, rval);
      #endif /* SYSCALL_DEBUG */
      
              ktrsysret(code, error, rval);
              
      #ifdef PTRACE
              if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED|PSL_SYSCALLEMU)) ==
                  (PSL_SYSCALL|PSL_TRACED)) {
                      proc_stoptrace(TRAP_SCX, code, args, rval, error);
              }
              CLR(p->p_slflag, PSL_SYSCALLEMU);
      #endif
      }
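
/*
 * Illustrative sketch (not part of the original source): a simplified,
 * hypothetical rendition of how an MD syscall dispatcher brackets the
 * actual call with trace_enter()/trace_exit() when tracing is enabled;
 * it is not any particular port's code.
 */
#if 0
	if (__predict_false(trace_is_enabled(l->l_proc))) {
		error = trace_enter(code, sy, args);
		if (error == 0)
			error = sy->sy_call(l, args, rval);
		trace_exit(code, sy, args, rval, error);
	} else {
		error = sy->sy_call(l, args, rval);
	}
#endif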
      /*        $NetBSD: cprng_fast.c,v 1.13 2015/04/13 22:43:41 riastradh Exp $        */
      
      /*-
       * Copyright (c) 2014 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Taylor R. Campbell.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.13 2015/04/13 22:43:41 riastradh Exp $");
      
      #include <sys/types.h>
      #include <sys/param.h>
      #include <sys/bitops.h>
      #include <sys/cprng.h>
      #include <sys/cpu.h>
      #include <sys/intr.h>
      #include <sys/percpu.h>
      #include <sys/rnd.h>                /* rnd_initial_entropy */
      
      /* ChaCha core */
      
      #define        crypto_core_OUTPUTWORDS        16
      #define        crypto_core_INPUTWORDS        4
      #define        crypto_core_KEYWORDS        8
      #define        crypto_core_CONSTWORDS        4
      
      #define        crypto_core_ROUNDS        8
      
      static uint32_t
      rotate(uint32_t u, unsigned c)
      {
      
              return (u << c) | (u >> (32 - c));
      }
      
      #define        QUARTERROUND(a, b, c, d) do {                                              \
              (a) += (b); (d) ^= (a); (d) = rotate((d), 16);                              \
              (c) += (d); (b) ^= (c); (b) = rotate((b), 12);                              \
              (a) += (b); (d) ^= (a); (d) = rotate((d),  8);                              \
              (c) += (d); (b) ^= (c); (b) = rotate((b),  7);                              \
      } while (0)
      
      static void
      crypto_core(uint32_t *out, const uint32_t *in, const uint32_t *k,
          const uint32_t *c)
      {
              uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
              int i;
      
              x0 = c[0];
              x1 = c[1];
              x2 = c[2];
              x3 = c[3];
	x4 = k[0];
              x5 = k[1];
              x6 = k[2];
              x7 = k[3];
              x8 = k[4];
              x9 = k[5];
              x10 = k[6];
              x11 = k[7];
              x12 = in[0];
              x13 = in[1];
              x14 = in[2];
              x15 = in[3];
      
              for (i = crypto_core_ROUNDS; i > 0; i -= 2) {
		QUARTERROUND( x0, x4, x8,x12);
                      QUARTERROUND( x1, x5, x9,x13);
                      QUARTERROUND( x2, x6,x10,x14);
                      QUARTERROUND( x3, x7,x11,x15);
                      QUARTERROUND( x0, x5,x10,x15);
                      QUARTERROUND( x1, x6,x11,x12);
                      QUARTERROUND( x2, x7, x8,x13);
                      QUARTERROUND( x3, x4, x9,x14);
              }
      
	out[0] = x0 + c[0];
              out[1] = x1 + c[1];
              out[2] = x2 + c[2];
              out[3] = x3 + c[3];
              out[4] = x4 + k[0];
              out[5] = x5 + k[1];
              out[6] = x6 + k[2];
              out[7] = x7 + k[3];
              out[8] = x8 + k[4];
              out[9] = x9 + k[5];
              out[10] = x10 + k[6];
              out[11] = x11 + k[7];
              out[12] = x12 + in[0];
              out[13] = x13 + in[1];
              out[14] = x14 + in[2];
              out[15] = x15 + in[3];
      }
      
      /* `expand 32-byte k' */
      static const uint32_t crypto_core_constant32[4] = {
              0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U,
      };
      
      /*
       * Test vector for ChaCha20 from
       * <http://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-00>,
       * test vectors for ChaCha12 and ChaCha8 generated by the same
       * crypto_core code with crypto_core_ROUNDS varied.
       */
      
      #define        check(E)        do                                                \
      {                                                                        \
              if (!(E))                                                        \
                      panic("crypto self-test failed: %s", #E);                \
      } while (0)
      
      static void
      crypto_core_selftest(void)
      {
              const uint32_t zero32[8] = {0};
              const uint8_t sigma[] = "expand 32-byte k";
              uint32_t block[16];
              unsigned i;
      
      #if crypto_core_ROUNDS == 8
              static const uint8_t out[64] = {
                      0x3e,0x00,0xef,0x2f,0x89,0x5f,0x40,0xd6,
                      0x7f,0x5b,0xb8,0xe8,0x1f,0x09,0xa5,0xa1,
                      0x2c,0x84,0x0e,0xc3,0xce,0x9a,0x7f,0x3b,
                      0x18,0x1b,0xe1,0x88,0xef,0x71,0x1a,0x1e,
                      0x98,0x4c,0xe1,0x72,0xb9,0x21,0x6f,0x41,
                      0x9f,0x44,0x53,0x67,0x45,0x6d,0x56,0x19,
                      0x31,0x4a,0x42,0xa3,0xda,0x86,0xb0,0x01,
                      0x38,0x7b,0xfd,0xb8,0x0e,0x0c,0xfe,0x42,
              };
      #elif crypto_core_ROUNDS == 12
              static const uint8_t out[64] = {
                      0x9b,0xf4,0x9a,0x6a,0x07,0x55,0xf9,0x53,
                      0x81,0x1f,0xce,0x12,0x5f,0x26,0x83,0xd5,
                      0x04,0x29,0xc3,0xbb,0x49,0xe0,0x74,0x14,
                      0x7e,0x00,0x89,0xa5,0x2e,0xae,0x15,0x5f,
                      0x05,0x64,0xf8,0x79,0xd2,0x7a,0xe3,0xc0,
                      0x2c,0xe8,0x28,0x34,0xac,0xfa,0x8c,0x79,
                      0x3a,0x62,0x9f,0x2c,0xa0,0xde,0x69,0x19,
                      0x61,0x0b,0xe8,0x2f,0x41,0x13,0x26,0xbe,
              };
      #elif crypto_core_ROUNDS == 20
              static const uint8_t out[64] = {
                      0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90,
                      0x40,0x5d,0x6a,0xe5,0x53,0x86,0xbd,0x28,
                      0xbd,0xd2,0x19,0xb8,0xa0,0x8d,0xed,0x1a,
                      0xa8,0x36,0xef,0xcc,0x8b,0x77,0x0d,0xc7,
                      0xda,0x41,0x59,0x7c,0x51,0x57,0x48,0x8d,
                      0x77,0x24,0xe0,0x3f,0xb8,0xd8,0x4a,0x37,
                      0x6a,0x43,0xb8,0xf4,0x15,0x18,0xa1,0x1c,
                      0xc3,0x87,0xb6,0x69,0xb2,0xee,0x65,0x86,
              };
      #else
      #error crypto_core_ROUNDS must be 8, 12, or 20.
      #endif
      
              check(crypto_core_constant32[0] == le32dec(&sigma[0]));
              check(crypto_core_constant32[1] == le32dec(&sigma[4]));
              check(crypto_core_constant32[2] == le32dec(&sigma[8]));
              check(crypto_core_constant32[3] == le32dec(&sigma[12]));
      
              crypto_core(block, zero32, zero32, crypto_core_constant32);
              for (i = 0; i < 16; i++)
                      check(block[i] == le32dec(&out[i*4]));
      }
      
      #undef check
      
      #define        CPRNG_FAST_SEED_BYTES        (crypto_core_KEYWORDS * sizeof(uint32_t))
      
      struct cprng_fast {
              uint32_t         buffer[crypto_core_OUTPUTWORDS];
              uint32_t         key[crypto_core_KEYWORDS];
              uint32_t         nonce[crypto_core_INPUTWORDS];
              bool                have_initial;
      };
      
      __CTASSERT(sizeof ((struct cprng_fast *)0)->key == CPRNG_FAST_SEED_BYTES);
      
      static void        cprng_fast_init_cpu(void *, void *, struct cpu_info *);
      static void        cprng_fast_schedule_reseed(struct cprng_fast *);
      static void        cprng_fast_intr(void *);
      
      static void        cprng_fast_seed(struct cprng_fast *, const void *);
      static void        cprng_fast_buf(struct cprng_fast *, void *, unsigned);
      
      static void        cprng_fast_buf_short(void *, size_t);
      static void        cprng_fast_buf_long(void *, size_t);
      
      static percpu_t        *cprng_fast_percpu        __read_mostly;
      static void        *cprng_fast_softint        __read_mostly;
      
      void
      cprng_fast_init(void)
      {
      
              crypto_core_selftest();
              cprng_fast_percpu = percpu_alloc(sizeof(struct cprng_fast));
              percpu_foreach(cprng_fast_percpu, &cprng_fast_init_cpu, NULL);
              cprng_fast_softint = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE,
                  &cprng_fast_intr, NULL);
      }
      
      static void
      cprng_fast_init_cpu(void *p, void *arg __unused, struct cpu_info *ci __unused)
      {
              struct cprng_fast *const cprng = p;
              uint8_t seed[CPRNG_FAST_SEED_BYTES];
      
              cprng_strong(kern_cprng, seed, sizeof seed, 0);
              cprng_fast_seed(cprng, seed);
              cprng->have_initial = rnd_initial_entropy;
              (void)explicit_memset(seed, 0, sizeof seed);
      }
      
      static inline int
      cprng_fast_get(struct cprng_fast **cprngp)
      {
              struct cprng_fast *cprng;
              int s;
      
	*cprngp = cprng = percpu_getref(cprng_fast_percpu);
              s = splvm();
      
              if (__predict_false(!cprng->have_initial))
                      cprng_fast_schedule_reseed(cprng);
      
              return s;
      }
      
      static inline void
      cprng_fast_put(struct cprng_fast *cprng, int s)
      {
      
	KASSERT((cprng == percpu_getref(cprng_fast_percpu)) &&
	    (percpu_putref(cprng_fast_percpu), true));
	splx(s);
              percpu_putref(cprng_fast_percpu);
      }
      
      static void
      cprng_fast_schedule_reseed(struct cprng_fast *cprng __unused)
      {
      
              softint_schedule(cprng_fast_softint);
      }
      
      static void
      cprng_fast_intr(void *cookie __unused)
      {
              struct cprng_fast *cprng;
              uint8_t seed[CPRNG_FAST_SEED_BYTES];
              int s;
      
              cprng_strong(kern_cprng, seed, sizeof(seed), 0);
      
              cprng = percpu_getref(cprng_fast_percpu);
              s = splvm();
              cprng_fast_seed(cprng, seed);
              cprng->have_initial = rnd_initial_entropy;
              splx(s);
              percpu_putref(cprng_fast_percpu);
      
              explicit_memset(seed, 0, sizeof(seed));
      }
      
      /* CPRNG algorithm */
      
      /*
       * The state consists of a key, the current nonce, and a 64-byte buffer
       * of output.  Since we fill the buffer only when we need output, and
       * eat a 32-bit word at a time, one 32-bit word of the buffer would be
       * wasted.  Instead, we repurpose it to count the number of entries in
       * the buffer remaining, counting from high to low in order to allow
       * comparison to zero to detect when we need to refill it.
       */
      #define        CPRNG_FAST_BUFIDX        (crypto_core_OUTPUTWORDS - 1)
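
/*
 * Hypothetical userland-style demonstration (not part of the original
 * source, not built) of the counter-in-the-buffer trick described
 * above, using a plain array instead of ChaCha output: the last slot
 * holds the number of unread words, so slot 15 itself is never handed
 * out and a refill yields exactly 15 words.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint32_t buf[16] = { 0 };
	unsigned reads;

	buf[15] = 15;			/* "refill": words 0..14 are fresh */
	for (reads = 0; buf[15] > 0; reads++)
		(void)buf[--buf[15]];	/* consume one word */
	assert(reads == 15);		/* buffer exhausted, time to refill */
	return 0;
}
#endif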
      
      static void
      cprng_fast_seed(struct cprng_fast *cprng, const void *seed)
      {
      
              (void)memset(cprng->buffer, 0, sizeof cprng->buffer);
              (void)memcpy(cprng->key, seed, sizeof cprng->key);
              (void)memset(cprng->nonce, 0, sizeof cprng->nonce);
      }
      
      static inline uint32_t
      cprng_fast_word(struct cprng_fast *cprng)
      {
              uint32_t v;
      
	if (__predict_true(0 < cprng->buffer[CPRNG_FAST_BUFIDX])) {
		v = cprng->buffer[--cprng->buffer[CPRNG_FAST_BUFIDX]];
	} else {
		/* If we don't have enough words, refill the buffer.  */
		crypto_core(cprng->buffer, cprng->nonce, cprng->key,
		    crypto_core_constant32);
		if (__predict_false(++cprng->nonce[0] == 0)) {
			cprng->nonce[1]++;
			cprng_fast_schedule_reseed(cprng);
		}
		v = cprng->buffer[CPRNG_FAST_BUFIDX];
                      cprng->buffer[CPRNG_FAST_BUFIDX] = CPRNG_FAST_BUFIDX;
              }
      
              return v;
      }
      
      static inline void
      cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned n)
      {
              uint8_t *p = buf;
              uint32_t v;
              unsigned w, r;
      
              w = n / sizeof(uint32_t);
              while (w--) {
                      v = cprng_fast_word(cprng);
                      (void)memcpy(p, &v, 4);
                      p += 4;
              }
      
              r = n % sizeof(uint32_t);
              if (r) {
                      v = cprng_fast_word(cprng);
                      while (r--) {
                              *p++ = (v & 0xff);
                              v >>= 8;
                      }
              }
      }
      
      /*
       * crypto_onetimestream: Expand a short unpredictable one-time seed
       * into a long unpredictable output.
       */
      static void
      crypto_onetimestream(const uint32_t seed[crypto_core_KEYWORDS], void *buf,
          size_t n)
      {
              uint32_t block[crypto_core_OUTPUTWORDS];
              uint32_t nonce[crypto_core_INPUTWORDS] = {0};
              uint8_t *p8;
              uint32_t *p32;
              size_t ni, nb, nf;
      
              /*
               * Guarantee we can generate up to n bytes.  We have
               * 2^(32*INPUTWORDS) possible inputs yielding output of
               * 4*OUTPUTWORDS*2^(32*INPUTWORDS) bytes.  It suffices to
	 * require that sizeof n, which bounds (1/CHAR_BIT) log_2 n from
	 * above, be at most (1/CHAR_BIT) log_2 of the total output
	 * stream length.  We
               * have
               *
               *        log_2 (4 o 2^(32 i)) = log_2 (4 o) + log_2 2^(32 i)
               *          = 2 + log_2 o + 32 i.
               */
              __CTASSERT(CHAR_BIT*sizeof n <=
                  (2 + ilog2(crypto_core_OUTPUTWORDS) + 32*crypto_core_INPUTWORDS));
      
              p8 = buf;
              p32 = (uint32_t *)roundup2((uintptr_t)p8, sizeof(uint32_t));
              ni = (uint8_t *)p32 - p8;
              if (n < ni)
                      ni = n;
              nb = (n - ni) / sizeof block;
              nf = (n - ni) % sizeof block;
      
              KASSERT(((uintptr_t)p32 & 3) == 0);
              KASSERT(ni <= n);
              KASSERT(nb <= (n / sizeof block));
              KASSERT(nf <= n);
              KASSERT(n == (ni + (nb * sizeof block) + nf));
              KASSERT(ni < sizeof(uint32_t));
              KASSERT(nf < sizeof block);
      
              if (ni) {
                      crypto_core(block, nonce, seed, crypto_core_constant32);
                      nonce[0]++;
                      (void)memcpy(p8, block, ni);
              }
              while (nb--) {
                      crypto_core(p32, nonce, seed, crypto_core_constant32);
                      if (++nonce[0] == 0)
                              nonce[1]++;
                      p32 += crypto_core_OUTPUTWORDS;
              }
              if (nf) {
                      crypto_core(block, nonce, seed, crypto_core_constant32);
                      if (++nonce[0] == 0)
                              nonce[1]++;
                      (void)memcpy(p32, block, nf);
              }
      
              if (ni | nf)
                      (void)explicit_memset(block, 0, sizeof block);
      }
      
      /* Public API */
      
      uint32_t
      cprng_fast32(void)
      {
              struct cprng_fast *cprng;
              uint32_t v;
              int s;
      
	s = cprng_fast_get(&cprng);
	v = cprng_fast_word(cprng);
	cprng_fast_put(cprng, s);
      
              return v;
      }
      
      uint64_t
      cprng_fast64(void)
      {
              struct cprng_fast *cprng;
              uint32_t hi, lo;
              int s;
      
              s = cprng_fast_get(&cprng);
              hi = cprng_fast_word(cprng);
              lo = cprng_fast_word(cprng);
              cprng_fast_put(cprng, s);
      
              return ((uint64_t)hi << 32) | lo;
      }
      
      static void
      cprng_fast_buf_short(void *buf, size_t len)
      {
              struct cprng_fast *cprng;
              int s;
      
              s = cprng_fast_get(&cprng);
              cprng_fast_buf(cprng, buf, len);
              cprng_fast_put(cprng, s);
      }
      
      static __noinline void
      cprng_fast_buf_long(void *buf, size_t len)
      {
              uint32_t seed[crypto_core_KEYWORDS];
              struct cprng_fast *cprng;
              int s;
      
              s = cprng_fast_get(&cprng);
              cprng_fast_buf(cprng, seed, sizeof seed);
              cprng_fast_put(cprng, s);
      
              crypto_onetimestream(seed, buf, len);
      
              (void)explicit_memset(seed, 0, sizeof seed);
      }
      
      size_t
      cprng_fast(void *buf, size_t len)
      {
      
              /*
               * We don't want to hog the CPU, so we use the short version,
               * to generate output without preemption, only if we can do it
               * with at most one crypto_core.
               */
              if (len <= (sizeof(uint32_t) * crypto_core_OUTPUTWORDS))
                      cprng_fast_buf_short(buf, len);
              else
                      cprng_fast_buf_long(buf, len);
      
              return len;
      }
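
/*
 * Illustrative sketch (not part of the original source): typical callers
 * of this interface.  example_fill() and its uses are hypothetical.
 */
#if 0
static void
example_fill(void)
{
	uint32_t jitter;
	uint8_t buf[256];

	/* A single 32-bit value, e.g. to perturb a timeout. */
	jitter = cprng_fast32() % 64;

	/* Fill an arbitrary-length buffer. */
	(void)cprng_fast(buf, sizeof(buf));

	(void)jitter;
}
#endif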
      /*        $NetBSD: uvm_fault_i.h,v 1.31 2018/05/08 19:33:57 christos Exp $        */
      
      /*
       * Copyright (c) 1997 Charles D. Cranor and Washington University.
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
       * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
       * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
       * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
       * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
       * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
       * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       *
       * from: Id: uvm_fault_i.h,v 1.1.6.1 1997/12/08 16:07:12 chuck Exp
       */
      
      #ifndef _UVM_UVM_FAULT_I_H_
      #define _UVM_UVM_FAULT_I_H_
      
      /*
       * uvm_fault_i.h: fault inline functions
       */
      void uvmfault_update_stats(struct uvm_faultinfo *);
      
      
      /*
       * uvmfault_unlockmaps: unlock the maps
       */
      
      static __inline void
      uvmfault_unlockmaps(struct uvm_faultinfo *ufi, bool write_locked)
      {
              /*
               * ufi can be NULL when this isn't really a fault,
               * but merely paging in anon data.
               */
      
              if (ufi == NULL) {
                      return;
              }
      
      #ifndef __HAVE_NO_PMAP_STATS
	uvmfault_update_stats(ufi);
      #endif
              if (write_locked) {
                      vm_map_unlock(ufi->map);
              } else {
                      vm_map_unlock_read(ufi->map);
              }
      }
      
      /*
       * uvmfault_unlockall: unlock everything passed in.
       *
       * => maps must be read-locked (not write-locked).
       */
      
      static __inline void
      uvmfault_unlockall(struct uvm_faultinfo *ufi, struct vm_amap *amap,
          struct uvm_object *uobj)
      {
      
              if (uobj)
		mutex_exit(uobj->vmobjlock);
	if (amap)
		amap_unlock(amap);
	uvmfault_unlockmaps(ufi, false);
      }
      
      /*
       * uvmfault_lookup: lookup a virtual address in a map
       *
       * => caller must provide a uvm_faultinfo structure with the IN
       *        params properly filled in
       * => we will lookup the map entry (handling submaps) as we go
       * => if the lookup is a success we will return with the maps locked
       * => if "write_lock" is true, we write_lock the map, otherwise we only
       *        get a read lock.
       * => note that submaps can only appear in the kernel and they are
       *        required to use the same virtual addresses as the map they
       *        are referenced by (thus address translation between the main
       *        map and the submap is unnecessary).
       */
      
      static __inline bool
      uvmfault_lookup(struct uvm_faultinfo *ufi, bool write_lock)
      {
              struct vm_map *tmpmap;
      
              /*
               * init ufi values for lookup.
               */
      
	ufi->map = ufi->orig_map;
              ufi->size = ufi->orig_size;
      
              /*
               * keep going down levels until we are done.   note that there can
               * only be two levels so we won't loop very long.
               */
      
              for (;;) {
                      /*
                       * lock map
                       */
                      if (write_lock) {
			vm_map_lock(ufi->map);
		} else {
			vm_map_lock_read(ufi->map);
                      }
      
                      /*
                       * lookup
                       */
                      if (!uvm_map_lookup_entry(ufi->map, ufi->orig_rvaddr,
                          &ufi->entry)) {
			uvmfault_unlockmaps(ufi, write_lock);
                              return(false);
                      }
      
                      /*
                       * reduce size if necessary
                       */
		if (ufi->entry->end - ufi->orig_rvaddr < ufi->size)
                              ufi->size = ufi->entry->end - ufi->orig_rvaddr;
      
                      /*
                       * submap?    replace map with the submap and lookup again.
                       * note: VAs in submaps must match VAs in main map.
                       */
		if (UVM_ET_ISSUBMAP(ufi->entry)) {
                              tmpmap = ufi->entry->object.sub_map;
                              if (write_lock) {
                                      vm_map_unlock(ufi->map);
                              } else {
                                      vm_map_unlock_read(ufi->map);
                              }
                              ufi->map = tmpmap;
                              continue;
                      }
      
                      /*
                       * got it!
                       */
      
		ufi->mapv = ufi->map->timestamp;
                      return(true);
      
              }        /* while loop */
      
              /*NOTREACHED*/
      }
      
      /*
       * uvmfault_relock: attempt to relock the same version of the map
       *
       * => fault data structures should be unlocked before calling.
       * => if a success (true) maps will be locked after call.
       */
      
      static __inline bool
      uvmfault_relock(struct uvm_faultinfo *ufi)
      {
              /*
               * ufi can be NULL when this isn't really a fault,
               * but merely paging in anon data.
               */
      
              if (ufi == NULL) {
                      return true;
              }
      
	uvmexp.fltrelck++;
      
              /*
               * relock map.   fail if version mismatch (in which case nothing
               * gets locked).
               */
      
              vm_map_lock_read(ufi->map);
              if (ufi->mapv != ufi->map->timestamp) {
                      vm_map_unlock_read(ufi->map);
                      return(false);
              }
      
	uvmexp.fltrelckok++;
              return(true);
      }
      
      #endif /* _UVM_UVM_FAULT_I_H_ */
      /*        $NetBSD: rndsource.h,v 1.6 2018/04/19 21:19:07 christos Exp $        */
      
      /*-
       * Copyright (c) 1997 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Michael Graff <explorer@flame.org>.  This code uses ideas and
       * algorithms from the Linux driver written by Ted Ts'o.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      #ifndef        _SYS_RNDSOURCE_H
      #define        _SYS_RNDSOURCE_H
      
      #ifndef _KERNEL                        /* XXX */
      #error <sys/rndsource.h> is meant for kernel consumers only.
      #endif
      
      #include <sys/types.h>
      #include <sys/null.h>
      #include <sys/rndio.h>                /* RND_TYPE_*, RND_FLAG_* */
      #include <sys/rngtest.h>
      #include <sys/queue.h>
      
      typedef struct rnd_delta_estimator {
              uint64_t        x;
              uint64_t        dx;
              uint64_t        d2x;
              uint64_t        insamples;
              uint64_t        outbits;
      } rnd_delta_t;
      
      typedef struct krndsource {
              LIST_ENTRY(krndsource) list;        /* the linked list */
              char            name[16];       /* device name */
              rnd_delta_t        time_delta;        /* time delta estimator */
              rnd_delta_t        value_delta;        /* value delta estimator */
              uint32_t        total;          /* entropy from this source */
              uint32_t        type;           /* type */
              uint32_t        flags;          /* flags */
              void            *state;         /* state information */
              size_t          test_cnt;       /* how much test data accumulated? */
              void                (*get)(size_t, void *);        /* pool wants N bytes (badly) */
              void                *getarg;        /* argument to get-function */
              void                (*enable)(struct krndsource *, bool); /* turn on/off */
              rngtest_t        *test;                /* test data for RNG type sources */
              unsigned        refcnt;
      } krndsource_t;
      
      static __inline void
      rndsource_setcb(struct krndsource *const rs, void (*const cb)(size_t, void *),
          void *const arg)
      {
              rs->get = cb;
              rs->getarg = arg;
      }
      
      static __inline void
      rndsource_setenable(struct krndsource *const rs, void *const cb)
      {
              rs->enable = cb;
      }
      
      #define RND_ENABLED(rp) \
              (((rp)->flags & RND_FLAG_NO_COLLECT) == 0)
      
      void                _rnd_add_uint32(krndsource_t *, uint32_t);
      void                _rnd_add_uint64(krndsource_t *, uint64_t);
      void                rnd_add_data(krndsource_t *, const void *const, uint32_t,
                          uint32_t);
      void                rnd_add_data_sync(krndsource_t *, const void *, uint32_t,
                          uint32_t);
      void                rnd_attach_source(krndsource_t *, const char *,
                          uint32_t, uint32_t);
      void                rnd_detach_source(krndsource_t *);
      
      static __inline void
      rnd_add_uint32(krndsource_t *kr, uint32_t val)
      {
              if (__predict_true(kr)) {
		if (RND_ENABLED(kr)) {
			_rnd_add_uint32(kr, val);
                      }
              } else {
                      rnd_add_data(NULL, &val, sizeof(val), 0);
              }
      }
      
      static __inline void
      rnd_add_uint64(krndsource_t *kr, uint64_t val)
      {
              if (__predict_true(kr)) {
                      if (RND_ENABLED(kr)) {
                              _rnd_add_uint64(kr, val);
                      }
              } else {
                      rnd_add_data(NULL, &val, sizeof(val), 0);
              }
      }
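
/*
 * Illustrative sketch (not part of the original source): a driver
 * typically owns a krndsource_t, attaches it once, feeds samples from
 * its interrupt handler, and detaches it on teardown.  The names,
 * source type, and flags below are hypothetical choices.
 */
#if 0
static krndsource_t example_rndsource;

static void
example_attach_rnd(void)
{

	rnd_attach_source(&example_rndsource, "example0",
	    RND_TYPE_NET, RND_FLAG_DEFAULT);
}

static void
example_intr(uint32_t sample)
{

	rnd_add_uint32(&example_rndsource, sample);
}

static void
example_detach_rnd(void)
{

	rnd_detach_source(&example_rndsource);
}
#endif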
      
      #endif        /* _SYS_RNDSOURCE_H */
      /*        $NetBSD: subr_workqueue.c,v 1.37 2018/06/13 05:26:12 ozaki-r Exp $        */
      
      /*-
       * Copyright (c)2002, 2005, 2006, 2007 YAMAMOTO Takashi,
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: subr_workqueue.c,v 1.37 2018/06/13 05:26:12 ozaki-r Exp $");
      
      #include <sys/param.h>
      #include <sys/cpu.h>
      #include <sys/systm.h>
      #include <sys/kthread.h>
      #include <sys/kmem.h>
      #include <sys/proc.h>
      #include <sys/workqueue.h>
      #include <sys/mutex.h>
      #include <sys/condvar.h>
      #include <sys/queue.h>
      
      typedef struct work_impl {
              SIMPLEQ_ENTRY(work_impl) wk_entry;
      } work_impl_t;
      
      SIMPLEQ_HEAD(workqhead, work_impl);
      
      struct workqueue_queue {
              kmutex_t q_mutex;
              kcondvar_t q_cv;
              struct workqhead q_queue_pending;
              struct workqhead q_queue_running;
              lwp_t *q_worker;
              work_impl_t *q_waiter;
      };
      
      struct workqueue {
              void (*wq_func)(struct work *, void *);
              void *wq_arg;
              int wq_flags;
      
              char wq_name[MAXCOMLEN];
              pri_t wq_prio;
              void *wq_ptr;
      };
      
      #define        WQ_SIZE                (roundup2(sizeof(struct workqueue), coherency_unit))
      #define        WQ_QUEUE_SIZE        (roundup2(sizeof(struct workqueue_queue), coherency_unit))
      
      #define        POISON        0xaabbccdd
      
      static size_t
      workqueue_size(int flags)
      {
      
              return WQ_SIZE
                  + ((flags & WQ_PERCPU) != 0 ? ncpu : 1) * WQ_QUEUE_SIZE
                  + coherency_unit;
      }
      
      static struct workqueue_queue *
      workqueue_queue_lookup(struct workqueue *wq, struct cpu_info *ci)
      {
              u_int idx = 0;
      
              if (wq->wq_flags & WQ_PERCPU) {
                      idx = ci ? cpu_index(ci) : cpu_index(curcpu());
              }
      
	return (void *)((uintptr_t)(wq) + WQ_SIZE + (idx * WQ_QUEUE_SIZE));
      }
      
      static void
      workqueue_runlist(struct workqueue *wq, struct workqhead *list)
      {
              work_impl_t *wk;
              work_impl_t *next;
      
              /*
               * note that "list" is not a complete SIMPLEQ.
               */
      
              for (wk = SIMPLEQ_FIRST(list); wk != NULL; wk = next) {
                      next = SIMPLEQ_NEXT(wk, wk_entry);
                      (*wq->wq_func)((void *)wk, wq->wq_arg);
              }
      }
      
      static void
      workqueue_worker(void *cookie)
      {
              struct workqueue *wq = cookie;
              struct workqueue_queue *q;
      
              /* find the workqueue of this kthread */
              q = workqueue_queue_lookup(wq, curlwp->l_cpu);
      
              for (;;) {
                      /*
		 * we violate the abstraction of SIMPLEQ.
                       */
      
                      mutex_enter(&q->q_mutex);
                      while (SIMPLEQ_EMPTY(&q->q_queue_pending))
                              cv_wait(&q->q_cv, &q->q_mutex);
                      KASSERT(SIMPLEQ_EMPTY(&q->q_queue_running));
                      q->q_queue_running.sqh_first =
                          q->q_queue_pending.sqh_first; /* XXX */
                      SIMPLEQ_INIT(&q->q_queue_pending);
                      mutex_exit(&q->q_mutex);
      
                      workqueue_runlist(wq, &q->q_queue_running);
      
                      mutex_enter(&q->q_mutex);
                      KASSERT(!SIMPLEQ_EMPTY(&q->q_queue_running));
                      SIMPLEQ_INIT(&q->q_queue_running);
                      if (__predict_false(q->q_waiter != NULL)) {
                              /* Wake up workqueue_wait */
                              cv_signal(&q->q_cv);
                      }
                      mutex_exit(&q->q_mutex);
              }
      }
      
      static void
      workqueue_init(struct workqueue *wq, const char *name,
          void (*callback_func)(struct work *, void *), void *callback_arg,
          pri_t prio, int ipl)
      {
      
              KASSERT(sizeof(wq->wq_name) > strlen(name));
              strncpy(wq->wq_name, name, sizeof(wq->wq_name));
      
              wq->wq_prio = prio;
              wq->wq_func = callback_func;
              wq->wq_arg = callback_arg;
      }
      
      static int
      workqueue_initqueue(struct workqueue *wq, struct workqueue_queue *q,
          int ipl, struct cpu_info *ci)
      {
              int error, ktf;
      
              KASSERT(q->q_worker == NULL);
      
              mutex_init(&q->q_mutex, MUTEX_DEFAULT, ipl);
              cv_init(&q->q_cv, wq->wq_name);
              SIMPLEQ_INIT(&q->q_queue_pending);
              SIMPLEQ_INIT(&q->q_queue_running);
              ktf = ((wq->wq_flags & WQ_MPSAFE) != 0 ? KTHREAD_MPSAFE : 0);
              if (wq->wq_prio < PRI_KERNEL)
                      ktf |= KTHREAD_TS;
              if (ci) {
                      error = kthread_create(wq->wq_prio, ktf, ci, workqueue_worker,
                          wq, &q->q_worker, "%s/%u", wq->wq_name, ci->ci_index);
              } else {
                      error = kthread_create(wq->wq_prio, ktf, ci, workqueue_worker,
                          wq, &q->q_worker, "%s", wq->wq_name);
              }
              if (error != 0) {
                      mutex_destroy(&q->q_mutex);
                      cv_destroy(&q->q_cv);
                      KASSERT(q->q_worker == NULL);
              }
              return error;
      }
      
      struct workqueue_exitargs {
              work_impl_t wqe_wk;
              struct workqueue_queue *wqe_q;
      };
      
      static void
      workqueue_exit(struct work *wk, void *arg)
      {
              struct workqueue_exitargs *wqe = (void *)wk;
              struct workqueue_queue *q = wqe->wqe_q;
      
              /*
               * only competition at this point is workqueue_finiqueue.
               */
      
              KASSERT(q->q_worker == curlwp);
              KASSERT(SIMPLEQ_EMPTY(&q->q_queue_pending));
              mutex_enter(&q->q_mutex);
              q->q_worker = NULL;
              cv_signal(&q->q_cv);
              mutex_exit(&q->q_mutex);
              kthread_exit(0);
      }
      
      static void
      workqueue_finiqueue(struct workqueue *wq, struct workqueue_queue *q)
      {
              struct workqueue_exitargs wqe;
      
              KASSERT(wq->wq_func == workqueue_exit);
      
              wqe.wqe_q = q;
              KASSERT(SIMPLEQ_EMPTY(&q->q_queue_pending));
              KASSERT(q->q_worker != NULL);
              mutex_enter(&q->q_mutex);
              SIMPLEQ_INSERT_TAIL(&q->q_queue_pending, &wqe.wqe_wk, wk_entry);
              cv_signal(&q->q_cv);
              while (q->q_worker != NULL) {
                      cv_wait(&q->q_cv, &q->q_mutex);
              }
              mutex_exit(&q->q_mutex);
              mutex_destroy(&q->q_mutex);
              cv_destroy(&q->q_cv);
      }
      
      /* --- */
      
      int
      workqueue_create(struct workqueue **wqp, const char *name,
          void (*callback_func)(struct work *, void *), void *callback_arg,
          pri_t prio, int ipl, int flags)
      {
              struct workqueue *wq;
              struct workqueue_queue *q;
              void *ptr;
              int error = 0;
      
              CTASSERT(sizeof(work_impl_t) <= sizeof(struct work));
      
              ptr = kmem_zalloc(workqueue_size(flags), KM_SLEEP);
              wq = (void *)roundup2((uintptr_t)ptr, coherency_unit);
              wq->wq_ptr = ptr;
              wq->wq_flags = flags;
      
              workqueue_init(wq, name, callback_func, callback_arg, prio, ipl);
      
              if (flags & WQ_PERCPU) {
                      struct cpu_info *ci;
                      CPU_INFO_ITERATOR cii;
      
                      /* create the work-queue for each CPU */
                      for (CPU_INFO_FOREACH(cii, ci)) {
                              q = workqueue_queue_lookup(wq, ci);
                              error = workqueue_initqueue(wq, q, ipl, ci);
                              if (error) {
                                      break;
                              }
                      }
              } else {
                      /* initialize a work-queue */
                      q = workqueue_queue_lookup(wq, NULL);
                      error = workqueue_initqueue(wq, q, ipl, NULL);
              }
      
              if (error != 0) {
                      workqueue_destroy(wq);
              } else {
                      *wqp = wq;
              }
      
              return error;
      }
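
/*
 * Illustrative sketch (not part of the original source): creating a
 * workqueue, submitting a work item embedded in a caller structure, and
 * tearing the queue down.  The example_* names and the priority/IPL
 * choices are hypothetical.
 */
#if 0
struct example_task {
	struct work	et_work;	/* first member, so a cast suffices */
	int		et_arg;
};

static struct workqueue *example_wq;

static void
example_work_func(struct work *wk, void *arg)
{
	struct example_task *et = (struct example_task *)wk;

	/* ... process et->et_arg in thread context ... */
	(void)et;
}

static int
example_init(void)
{

	return workqueue_create(&example_wq, "examplewq", example_work_func,
	    NULL, PRI_NONE, IPL_NONE, WQ_MPSAFE);
}

static void
example_submit(struct example_task *et)
{

	workqueue_enqueue(example_wq, &et->et_work, NULL);
}

static void
example_fini(void)
{

	workqueue_destroy(example_wq);
}
#endif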
      
      static bool
      workqueue_q_wait(struct workqueue_queue *q, work_impl_t *wk_target)
      {
              work_impl_t *wk;
              bool found = false;
      
              mutex_enter(&q->q_mutex);
              if (q->q_worker == curlwp)
                      goto out;
          again:
              SIMPLEQ_FOREACH(wk, &q->q_queue_pending, wk_entry) {
                      if (wk == wk_target)
                              goto found;
              }
              SIMPLEQ_FOREACH(wk, &q->q_queue_running, wk_entry) {
                      if (wk == wk_target)
                              goto found;
              }
          found:
              if (wk != NULL) {
                      found = true;
                      KASSERT(q->q_waiter == NULL);
                      q->q_waiter = wk;
                      cv_wait(&q->q_cv, &q->q_mutex);
                      goto again;
              }
              if (q->q_waiter != NULL)
                      q->q_waiter = NULL;
          out:
              mutex_exit(&q->q_mutex);
      
              return found;
      }
      
      /*
       * Wait for a specified work to finish.  The caller must ensure that no new
       * work will be enqueued before calling workqueue_wait.  Note that if the
       * workqueue is WQ_PERCPU, the caller can enqueue a new work to another queue
       * other than the waiting queue.
       */
      void
      workqueue_wait(struct workqueue *wq, struct work *wk)
      {
              struct workqueue_queue *q;
              bool found;
      
              if (ISSET(wq->wq_flags, WQ_PERCPU)) {
                      struct cpu_info *ci;
                      CPU_INFO_ITERATOR cii;
                      for (CPU_INFO_FOREACH(cii, ci)) {
                              q = workqueue_queue_lookup(wq, ci);
                              found = workqueue_q_wait(q, (work_impl_t *)wk);
                              if (found)
                                      break;
                      }
              } else {
                      q = workqueue_queue_lookup(wq, NULL);
                      (void) workqueue_q_wait(q, (work_impl_t *)wk);
              }
      }
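
       /*
        * Illustrative usage sketch, not part of the original source: a driver
        * defers processing to thread context and drains it before detaching.
        * The names example_softc/example_work, the sc/error variables, and the
        * PRI_NONE/IPL_NONE arguments are assumptions made for this example.
        *
        *        struct example_softc {
        *                struct workqueue *sc_wq;
        *                struct work sc_wk;
        *        };
        *
        *        static void
        *        example_work(struct work *wk, void *arg)
        *        {
        *                struct example_softc *sc = arg;
        *
        *                (void)wk;
        *                (void)sc;
        *        }
        *
        *        error = workqueue_create(&sc->sc_wq, "examplewq", example_work,
        *            sc, PRI_NONE, IPL_NONE, 0);
        *
        *        workqueue_enqueue(sc->sc_wq, &sc->sc_wk, NULL);
        *
        * A given struct work may be enqueued only once until its callback has
        * run (see workqueue_check_duplication above).  On detach, the caller
        * first guarantees that nothing will enqueue sc_wk again, then drains
        * and destroys:
        *
        *        workqueue_wait(sc->sc_wq, &sc->sc_wk);
        *        workqueue_destroy(sc->sc_wq);
        */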
      
      void
      workqueue_destroy(struct workqueue *wq)
      {
              struct workqueue_queue *q;
              struct cpu_info *ci;
              CPU_INFO_ITERATOR cii;
      
              wq->wq_func = workqueue_exit;
              for (CPU_INFO_FOREACH(cii, ci)) {
                      q = workqueue_queue_lookup(wq, ci);
                      if (q->q_worker != NULL) {
                              workqueue_finiqueue(wq, q);
                      }
              }
              kmem_free(wq->wq_ptr, workqueue_size(wq->wq_flags));
      }
      
      #ifdef DEBUG
      static void
      workqueue_check_duplication(struct workqueue_queue *q, work_impl_t *wk)
      {
              work_impl_t *_wk;
      
               SIMPLEQ_FOREACH(_wk, &q->q_queue_pending, wk_entry) {
                       if (_wk == wk)
                              panic("%s: tried to enqueue a queued work", __func__);
              }
      }
      #endif
      
      void
      workqueue_enqueue(struct workqueue *wq, struct work *wk0, struct cpu_info *ci)
      {
              struct workqueue_queue *q;
              work_impl_t *wk = (void *)wk0;
      
               KASSERT(wq->wq_flags & WQ_PERCPU || ci == NULL);
               q = workqueue_queue_lookup(wq, ci);
      
              mutex_enter(&q->q_mutex);
              KASSERT(q->q_waiter == NULL);
      #ifdef DEBUG
               workqueue_check_duplication(q, wk);
       #endif
               SIMPLEQ_INSERT_TAIL(&q->q_queue_pending, wk, wk_entry);
              cv_signal(&q->q_cv);
              mutex_exit(&q->q_mutex);
      }
      /*        $NetBSD: kern_hook.c,v 1.6 2013/11/22 21:04:11 christos Exp $        */
      
      /*-
       * Copyright (c) 1997, 1998, 1999, 2002, 2007, 2008 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
       * NASA Ames Research Center, and by Luke Mewburn.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_hook.c,v 1.6 2013/11/22 21:04:11 christos Exp $");
      
      #include <sys/param.h>
      #include <sys/malloc.h>
      #include <sys/rwlock.h>
      #include <sys/systm.h>
      #include <sys/device.h>
      
      /*
       * A generic linear hook.
       */
      struct hook_desc {
              LIST_ENTRY(hook_desc) hk_list;
              void        (*hk_fn)(void *);
              void        *hk_arg;
      };
      typedef LIST_HEAD(, hook_desc) hook_list_t;
      
      int        powerhook_debug = 0;
      
      static void *
      hook_establish(hook_list_t *list, void (*fn)(void *), void *arg)
      {
              struct hook_desc *hd;
      
              hd = malloc(sizeof(*hd), M_DEVBUF, M_NOWAIT);
              if (hd == NULL)
                      return (NULL);
      
              hd->hk_fn = fn;
              hd->hk_arg = arg;
              LIST_INSERT_HEAD(list, hd, hk_list);
      
              return (hd);
      }
      
      static void
      hook_disestablish(hook_list_t *list, void *vhook)
      {
      #ifdef DIAGNOSTIC
              struct hook_desc *hd;
      
              LIST_FOREACH(hd, list, hk_list) {
                      if (hd == vhook)
                              break;
              }
      
              if (hd == NULL)
                      panic("hook_disestablish: hook %p not established", vhook);
      #endif
              LIST_REMOVE((struct hook_desc *)vhook, hk_list);
              free(vhook, M_DEVBUF);
      }
      
      static void
      hook_destroy(hook_list_t *list)
      {
              struct hook_desc *hd;
      
              while ((hd = LIST_FIRST(list)) != NULL) {
                      LIST_REMOVE(hd, hk_list);
                      free(hd, M_DEVBUF);
              }
      }
      
      static void
      hook_proc_run(hook_list_t *list, struct proc *p)
      {
              struct hook_desc *hd;
      
              LIST_FOREACH(hd, list, hk_list)
                      ((void (*)(struct proc *, void *))*hd->hk_fn)(p, hd->hk_arg);
      }
      
      /*
       * "Shutdown hook" types, functions, and variables.
       *
        * Shutdown hooks are invoked immediately before the system is
        * halted or rebooted, i.e. after file systems are unmounted,
        * after the crash dump is done, etc.
       *
       * Each shutdown hook is removed from the list before it's run, so that
       * it won't be run again.
       */
      
      static hook_list_t shutdownhook_list = LIST_HEAD_INITIALIZER(shutdownhook_list);
      
      void *
      shutdownhook_establish(void (*fn)(void *), void *arg)
      {
              return hook_establish(&shutdownhook_list, fn, arg);
      }
      
      void
      shutdownhook_disestablish(void *vhook)
      {
              hook_disestablish(&shutdownhook_list, vhook);
      }
      
      /*
        * Run shutdown hooks.  This should be invoked immediately before the
        * system is halted or rebooted, i.e. after file systems are unmounted,
        * after the crash dump is done, etc.
       *
       * Each shutdown hook is removed from the list before it's run, so that
       * it won't be run again.
       */
      void
      doshutdownhooks(void)
      {
              struct hook_desc *dp;
      
              while ((dp = LIST_FIRST(&shutdownhook_list)) != NULL) {
                      LIST_REMOVE(dp, hk_list);
                      (*dp->hk_fn)(dp->hk_arg);
      #if 0
                      /*
                        * Don't bother freeing the hook structure, since we may
                       * be rebooting because of a memory corruption problem,
                       * and this might only make things worse.  It doesn't
                       * matter, anyway, since the system is just about to
                       * reboot.
                       */
                      free(dp, M_DEVBUF);
      #endif
              }
      }
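
       /*
        * Illustrative sketch, not part of the original source: the usual driver
        * pattern is to register a shutdown hook at attach time and remove it at
        * detach.  example_shutdown, example_softc and the sc_sdhook member are
        * hypothetical names; the hook body would quiesce the hardware before
        * the halt or reboot.
        *
        *        static void
        *        example_shutdown(void *arg)
        *        {
        *                struct example_softc *sc = arg;
        *
        *                (void)sc;
        *        }
        *
        *        sc->sc_sdhook = shutdownhook_establish(example_shutdown, sc);
        *        if (sc->sc_sdhook == NULL)
        *                aprint_error("couldn't establish shutdown hook\n");
        *
        *        if (sc->sc_sdhook != NULL)
        *                shutdownhook_disestablish(sc->sc_sdhook);
        */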
      
      /*
       * "Mountroot hook" types, functions, and variables.
       */
      
      static hook_list_t mountroothook_list=LIST_HEAD_INITIALIZER(mountroothook_list);
      
      void *
      mountroothook_establish(void (*fn)(device_t), device_t dev)
      {
              return hook_establish(&mountroothook_list, (void (*)(void *))fn, dev);
      }
      
      void
      mountroothook_disestablish(void *vhook)
      {
              hook_disestablish(&mountroothook_list, vhook);
      }
      
      void
      mountroothook_destroy(void)
      {
              hook_destroy(&mountroothook_list);
      }
      
      void
      domountroothook(device_t therootdev)
      {
              struct hook_desc *hd;
      
              LIST_FOREACH(hd, &mountroothook_list, hk_list) {
                      if (hd->hk_arg == therootdev) {
                              (*hd->hk_fn)(hd->hk_arg);
                              return;
                      }
              }
      }
      
      static hook_list_t exechook_list = LIST_HEAD_INITIALIZER(exechook_list);
      
      void *
      exechook_establish(void (*fn)(struct proc *, void *), void *arg)
      {
              return hook_establish(&exechook_list, (void (*)(void *))fn, arg);
      }
      
      void
      exechook_disestablish(void *vhook)
      {
              hook_disestablish(&exechook_list, vhook);
      }
      
      /*
       * Run exec hooks.
       */
      void
      doexechooks(struct proc *p)
      {
              hook_proc_run(&exechook_list, p);
      }
      
      static hook_list_t exithook_list = LIST_HEAD_INITIALIZER(exithook_list);
      extern krwlock_t exec_lock;
      
      void *
      exithook_establish(void (*fn)(struct proc *, void *), void *arg)
      {
              void *rv;
      
              rw_enter(&exec_lock, RW_WRITER);
              rv = hook_establish(&exithook_list, (void (*)(void *))fn, arg);
              rw_exit(&exec_lock);
              return rv;
      }
      
      void
      exithook_disestablish(void *vhook)
      {
      
              rw_enter(&exec_lock, RW_WRITER);
              hook_disestablish(&exithook_list, vhook);
              rw_exit(&exec_lock);
      }
      
      /*
       * Run exit hooks.
       */
      void
      doexithooks(struct proc *p)
      {
              hook_proc_run(&exithook_list, p);
      }
      
      static hook_list_t forkhook_list = LIST_HEAD_INITIALIZER(forkhook_list);
      
      void *
      forkhook_establish(void (*fn)(struct proc *, struct proc *))
      {
              return hook_establish(&forkhook_list, (void (*)(void *))fn, NULL);
      }
      
      void
      forkhook_disestablish(void *vhook)
      {
              hook_disestablish(&forkhook_list, vhook);
      }
      
      /*
       * Run fork hooks.
       */
      void
      doforkhooks(struct proc *p2, struct proc *p1)
      {
              struct hook_desc *hd;
      
               LIST_FOREACH(hd, &forkhook_list, hk_list) {
                      ((void (*)(struct proc *, struct proc *))*hd->hk_fn)
                          (p2, p1);
              }
      }
      
      static hook_list_t critpollhook_list = LIST_HEAD_INITIALIZER(critpollhook_list);
      
      void *
      critpollhook_establish(void (*fn)(void *), void *arg)
      {
              return hook_establish(&critpollhook_list, fn, arg);
      }
      
      void
      critpollhook_disestablish(void *vhook)
      {
              hook_disestablish(&critpollhook_list, vhook);
      }
      
      /*
       * Run critical polling hooks.
       */
      void
      docritpollhooks(void)
      {
              struct hook_desc *hd;
      
               LIST_FOREACH(hd, &critpollhook_list, hk_list) {
                       (*hd->hk_fn)(hd->hk_arg);
              }
      }
      
      /*
       * "Power hook" types, functions, and variables.
       * The list of power hooks is kept ordered with the last registered hook
       * first.
       * When running the hooks on power down the hooks are called in reverse
       * registration order, when powering up in registration order.
       */
      struct powerhook_desc {
              TAILQ_ENTRY(powerhook_desc) sfd_list;
              void        (*sfd_fn)(int, void *);
              void        *sfd_arg;
              char        sfd_name[16];
      };
      
      static TAILQ_HEAD(powerhook_head, powerhook_desc) powerhook_list =
          TAILQ_HEAD_INITIALIZER(powerhook_list);
      
      void *
      powerhook_establish(const char *name, void (*fn)(int, void *), void *arg)
      {
              struct powerhook_desc *ndp;
      
              ndp = (struct powerhook_desc *)
                  malloc(sizeof(*ndp), M_DEVBUF, M_NOWAIT);
              if (ndp == NULL)
                      return (NULL);
      
              ndp->sfd_fn = fn;
              ndp->sfd_arg = arg;
              strlcpy(ndp->sfd_name, name, sizeof(ndp->sfd_name));
              TAILQ_INSERT_HEAD(&powerhook_list, ndp, sfd_list);
      
              aprint_error("%s: WARNING: powerhook_establish is deprecated\n", name);
              return (ndp);
      }
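
       /*
        * Illustrative sketch, not part of the original source (and, as the
        * warning above notes, this interface is deprecated): a hook receives
        * one of the PWR_* values plus its registration argument.  The names
        * example_power/example_softc, the sc_powerhook member and the
        * example_save_state/example_restore_state helpers are hypothetical.
        *
        *        static void
        *        example_power(int why, void *arg)
        *        {
        *                struct example_softc *sc = arg;
        *
        *                switch (why) {
        *                case PWR_SUSPEND:
        *                        example_save_state(sc);
        *                        break;
        *                case PWR_RESUME:
        *                        example_restore_state(sc);
        *                        break;
        *                }
        *        }
        *
        *        sc->sc_powerhook = powerhook_establish("example",
        *            example_power, sc);
        */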
      
      void
      powerhook_disestablish(void *vhook)
      {
      #ifdef DIAGNOSTIC
              struct powerhook_desc *dp;
      
              TAILQ_FOREACH(dp, &powerhook_list, sfd_list)
                      if (dp == vhook)
                              goto found;
              panic("powerhook_disestablish: hook %p not established", vhook);
       found:
      #endif
      
              TAILQ_REMOVE(&powerhook_list, (struct powerhook_desc *)vhook,
                  sfd_list);
              free(vhook, M_DEVBUF);
      }
      
      /*
       * Run power hooks.
       */
      void
      dopowerhooks(int why)
      {
              struct powerhook_desc *dp;
              const char *why_name;
              static const char * pwr_names[] = {PWR_NAMES};
              why_name = why < __arraycount(pwr_names) ? pwr_names[why] : "???";
      
              if (why == PWR_RESUME || why == PWR_SOFTRESUME) {
                      TAILQ_FOREACH_REVERSE(dp, &powerhook_list, powerhook_head,
                          sfd_list)
                      {
                              if (powerhook_debug)
                                      printf("dopowerhooks %s: %s (%p)\n",
                                          why_name, dp->sfd_name, dp);
                              (*dp->sfd_fn)(why, dp->sfd_arg);
                      }
              } else {
                      TAILQ_FOREACH(dp, &powerhook_list, sfd_list) {
                              if (powerhook_debug)
                                      printf("dopowerhooks %s: %s (%p)\n",
                                          why_name, dp->sfd_name, dp);
                              (*dp->sfd_fn)(why, dp->sfd_arg);
                      }
              }
      
              if (powerhook_debug)
                      printf("dopowerhooks: %s done\n", why_name);
      }
      /*        $NetBSD: in.h,v 1.108 2018/11/09 11:46:28 maya Exp $        */
      
      /*
       * Copyright (c) 1982, 1986, 1990, 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)in.h        8.3 (Berkeley) 1/3/94
       */
      
      /*
       * Constants and structures defined by the internet system,
       * Per RFC 790, September 1981, and numerous additions.
       */
      
      #ifndef _NETINET_IN_H_
      #define        _NETINET_IN_H_
      
      #include <sys/featuretest.h>
      #include <machine/int_types.h>
      
      #ifndef        _BSD_UINT8_T_
      typedef __uint8_t        uint8_t;
      #define        _BSD_UINT8_T_
      #endif
      
      #ifndef        _BSD_UINT32_T_
      typedef __uint32_t        uint32_t;
      #define        _BSD_UINT32_T_
      #endif
      
      #include <sys/ansi.h>
      
      #ifndef in_addr_t
      typedef __in_addr_t        in_addr_t;
      #define        in_addr_t        __in_addr_t
      #endif
      
      #ifndef in_port_t
      typedef __in_port_t        in_port_t;
      #define        in_port_t        __in_port_t
      #endif
      
      #ifndef sa_family_t
      typedef __sa_family_t        sa_family_t;
      #define        sa_family_t        __sa_family_t
      #endif
      
      /*
       * Protocols
       */
      #define        IPPROTO_IP                0                /* dummy for IP */
      #define        IPPROTO_HOPOPTS                0                /* IP6 hop-by-hop options */
      #define        IPPROTO_ICMP                1                /* control message protocol */
      #define        IPPROTO_IGMP                2                /* group mgmt protocol */
      #define        IPPROTO_GGP                3                /* gateway^2 (deprecated) */
      #define        IPPROTO_IPV4                4                 /* IP header */
      #define        IPPROTO_IPIP                4                /* IP inside IP */
      #define        IPPROTO_TCP                6                /* tcp */
      #define        IPPROTO_EGP                8                /* exterior gateway protocol */
      #define        IPPROTO_PUP                12                /* pup */
      #define        IPPROTO_UDP                17                /* user datagram protocol */
      #define        IPPROTO_IDP                22                /* xns idp */
      #define        IPPROTO_TP                29                 /* tp-4 w/ class negotiation */
      #define        IPPROTO_DCCP                33                /* DCCP */
      #define        IPPROTO_IPV6                41                /* IP6 header */
      #define        IPPROTO_ROUTING                43                /* IP6 routing header */
      #define        IPPROTO_FRAGMENT        44                /* IP6 fragmentation header */
      #define        IPPROTO_RSVP                46                /* resource reservation */
      #define        IPPROTO_GRE                47                /* GRE encaps RFC 1701 */
      #define        IPPROTO_ESP                50                 /* encap. security payload */
      #define        IPPROTO_AH                51                 /* authentication header */
      #define        IPPROTO_MOBILE                55                /* IP Mobility RFC 2004 */
      #define        IPPROTO_IPV6_ICMP        58                /* IPv6 ICMP */
      #define        IPPROTO_ICMPV6                58                /* ICMP6 */
      #define        IPPROTO_NONE                59                /* IP6 no next header */
      #define        IPPROTO_DSTOPTS                60                /* IP6 destination option */
      #define        IPPROTO_EON                80                /* ISO cnlp */
      #define        IPPROTO_ETHERIP                97                /* Ethernet-in-IP */
      #define        IPPROTO_ENCAP                98                /* encapsulation header */
      #define        IPPROTO_PIM                103                /* Protocol indep. multicast */
      #define        IPPROTO_IPCOMP                108                /* IP Payload Comp. Protocol */
      #define        IPPROTO_VRRP                112                /* VRRP RFC 2338 */
       #define        IPPROTO_CARP                112                /* Common Address Redundancy Protocol */
      #define        IPPROTO_L2TP                115                /* L2TPv3 */
      #define        IPPROTO_SCTP                132                /* SCTP */
       #define        IPPROTO_PFSYNC                240                /* PFSYNC */
      #define        IPPROTO_RAW                255                /* raw IP packet */
      #define        IPPROTO_MAX                256
      
       /* last return value of *_input(), meaning "all work for this packet is done". */
      #define        IPPROTO_DONE                257
      
      /* sysctl placeholder for (FAST_)IPSEC */
      #define CTL_IPPROTO_IPSEC        258
      
      
      /*
       * Local port number conventions:
       *
       * Ports < IPPORT_RESERVED are reserved for privileged processes (e.g. root),
       * unless a kernel is compiled with IPNOPRIVPORTS defined.
       *
       * When a user does a bind(2) or connect(2) with a port number of zero,
       * a non-conflicting local port address is chosen.
       *
       * The default range is IPPORT_ANONMIN to IPPORT_ANONMAX, although
       * that is settable by sysctl(3); net.inet.ip.anonportmin and
       * net.inet.ip.anonportmax respectively.
       *
       * A user may set the IPPROTO_IP option IP_PORTRANGE to change this
       * default assignment range.
       *
       * The value IP_PORTRANGE_DEFAULT causes the default behavior.
       *
       * The value IP_PORTRANGE_HIGH is the same as IP_PORTRANGE_DEFAULT,
       * and exists only for FreeBSD compatibility purposes.
       *
        * The value IP_PORTRANGE_LOW changes the range to the "low" area
       * that is (by convention) restricted to privileged processes.
       * This convention is based on "vouchsafe" principles only.
       * It is only secure if you trust the remote host to restrict these ports.
       * The range is IPPORT_RESERVEDMIN to IPPORT_RESERVEDMAX.
       */
      
      #define        IPPORT_RESERVED                1024
      #define        IPPORT_ANONMIN                49152
      #define        IPPORT_ANONMAX                65535
      #define        IPPORT_RESERVEDMIN        600
      #define        IPPORT_RESERVEDMAX        (IPPORT_RESERVED-1)
      
      /*
       * Internet address (a structure for historical reasons)
       */
      struct in_addr {
              in_addr_t s_addr;
      } __packed;
      
      /*
       * Definitions of bits in internet address integers.
       * On subnets, the decomposition of addresses to host and net parts
       * is done according to subnet mask, not the masks here.
       *
       * By byte-swapping the constants, we avoid ever having to byte-swap IP
       * addresses inside the kernel.  Unfortunately, user-level programs rely
       * on these macros not doing byte-swapping.
       */
      #ifdef _KERNEL
      #define        __IPADDR(x)        ((uint32_t) htonl((uint32_t)(x)))
      #else
      #define        __IPADDR(x)        ((uint32_t)(x))
      #endif
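
       /*
        * Example, not part of the original header: because the constants are
        * pre-swapped in the kernel, an address taken directly from a packet
        * (network byte order) can be tested as-is,
        *
        *        if (IN_MULTICAST(ip->ip_dst.s_addr))
        *                ...
        *
        * whereas portable user-level code converts to host order first:
        *
        *        if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr)))
        *                ...
        *
        * Here "ip" and "sin" stand for an IPv4 header and a struct sockaddr_in
        * respectively.
        */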
      
      #define        IN_CLASSA(i)                (((uint32_t)(i) & __IPADDR(0x80000000)) == \
                                       __IPADDR(0x00000000))
      #define        IN_CLASSA_NET                __IPADDR(0xff000000)
      #define        IN_CLASSA_NSHIFT        24
      #define        IN_CLASSA_HOST                __IPADDR(0x00ffffff)
      #define        IN_CLASSA_MAX                128
      
      #define        IN_CLASSB(i)                (((uint32_t)(i) & __IPADDR(0xc0000000)) == \
                                       __IPADDR(0x80000000))
      #define        IN_CLASSB_NET                __IPADDR(0xffff0000)
      #define        IN_CLASSB_NSHIFT        16
      #define        IN_CLASSB_HOST                __IPADDR(0x0000ffff)
      #define        IN_CLASSB_MAX                65536
      
      #define        IN_CLASSC(i)                (((uint32_t)(i) & __IPADDR(0xe0000000)) == \
                                       __IPADDR(0xc0000000))
      #define        IN_CLASSC_NET                __IPADDR(0xffffff00)
      #define        IN_CLASSC_NSHIFT        8
      #define        IN_CLASSC_HOST                __IPADDR(0x000000ff)
      
      #define        IN_CLASSD(i)                (((uint32_t)(i) & __IPADDR(0xf0000000)) == \
                                       __IPADDR(0xe0000000))
      /* These ones aren't really net and host fields, but routing needn't know. */
      #define        IN_CLASSD_NET                __IPADDR(0xf0000000)
      #define        IN_CLASSD_NSHIFT        28
      #define        IN_CLASSD_HOST                __IPADDR(0x0fffffff)
      #define        IN_MULTICAST(i)                IN_CLASSD(i)
      
      #define        IN_EXPERIMENTAL(i)        (((uint32_t)(i) & __IPADDR(0xf0000000)) == \
                                       __IPADDR(0xf0000000))
      #define        IN_BADCLASS(i)                (((uint32_t)(i) & __IPADDR(0xf0000000)) == \
                                       __IPADDR(0xf0000000))
      
      #define IN_LINKLOCAL(i)        (((uint32_t)(i) & __IPADDR(0xffff0000)) == \
                               __IPADDR(0xa9fe0000))
      
      #define        IN_PRIVATE(i)        ((((uint32_t)(i) & __IPADDR(0xff000000)) ==        \
                                __IPADDR(0x0a000000))        ||                        \
                               (((uint32_t)(i) & __IPADDR(0xfff00000)) ==        \
                                __IPADDR(0xac100000))        ||                        \
                               (((uint32_t)(i) & __IPADDR(0xffff0000)) ==        \
                                __IPADDR(0xc0a80000)))
      
      #define        IN_LOCAL_GROUP(i)        (((uint32_t)(i) & __IPADDR(0xffffff00)) == \
                                       __IPADDR(0xe0000000))
      
      #define        IN_ANY_LOCAL(i)                (IN_LINKLOCAL(i) || IN_LOCAL_GROUP(i))
      
      #define        INADDR_ANY                __IPADDR(0x00000000)
      #define        INADDR_LOOPBACK                __IPADDR(0x7f000001)
      #define        INADDR_BROADCAST        __IPADDR(0xffffffff)        /* must be masked */
      #define        INADDR_NONE                __IPADDR(0xffffffff)        /* -1 return */
      
      #define        INADDR_UNSPEC_GROUP        __IPADDR(0xe0000000)        /* 224.0.0.0 */
      #define        INADDR_ALLHOSTS_GROUP        __IPADDR(0xe0000001)        /* 224.0.0.1 */
      #define        INADDR_ALLRTRS_GROUP        __IPADDR(0xe0000002)        /* 224.0.0.2 */
      #define        INADDR_CARP_GROUP        __IPADDR(0xe0000012)        /* 224.0.0.18 */
      #define        INADDR_MAX_LOCAL_GROUP        __IPADDR(0xe00000ff)        /* 224.0.0.255 */
      
      #define        IN_LOOPBACKNET                127                        /* official! */
      
      /*
       * Socket address, internet style.
       */
      struct sockaddr_in {
              uint8_t                sin_len;
              sa_family_t        sin_family;
              in_port_t        sin_port;
              struct in_addr        sin_addr;
              __int8_t        sin_zero[8];
      };
      
      #define        INET_ADDRSTRLEN                 16
      
      /*
       * Structure used to describe IP options.
       * Used to store options internally, to pass them to a process,
       * or to restore options retrieved earlier.
       * The ip_dst is used for the first-hop gateway when using a source route
       * (this gets put into the header proper).
       */
      struct ip_opts {
              struct in_addr        ip_dst;                /* first hop, 0 w/o src rt */
      #if defined(__cplusplus)
              __int8_t        Ip_opts[40];        /* actually variable in size */
      #else
              __int8_t        ip_opts[40];        /* actually variable in size */
      #endif
      };
      
      /*
       * Options for use with [gs]etsockopt at the IP level.
       * First word of comment is data type; bool is stored in int.
       */
      #define        IP_OPTIONS                1    /* buf/ip_opts; set/get IP options */
      #define        IP_HDRINCL                2    /* int; header is included with data */
      #define        IP_TOS                        3    /* int; IP type of service and preced. */
      #define        IP_TTL                        4    /* int; IP time to live */
      #define        IP_RECVOPTS                5    /* bool; receive all IP opts w/dgram */
      #define        IP_RECVRETOPTS                6    /* bool; receive IP opts for response */
      #define        IP_RECVDSTADDR                7    /* bool; receive IP dst addr w/dgram */
      #define        IP_RETOPTS                8    /* ip_opts; set/get IP options */
      #define        IP_MULTICAST_IF                9    /* in_addr; set/get IP multicast i/f  */
      #define        IP_MULTICAST_TTL        10   /* u_char; set/get IP multicast ttl */
      #define        IP_MULTICAST_LOOP        11   /* u_char; set/get IP multicast loopback */
      /* The add and drop membership option numbers need to match with the v6 ones */
      #define        IP_ADD_MEMBERSHIP        12   /* ip_mreq; add an IP group membership */
      #define        IP_DROP_MEMBERSHIP        13   /* ip_mreq; drop an IP group membership */
      #define        IP_PORTALGO                18   /* int; port selection algo (rfc6056) */
      #define        IP_PORTRANGE                19   /* int; range to use for ephemeral port */
       #define        IP_RECVIF                20   /* bool; receive reception i/f w/dgram */
      #define        IP_ERRORMTU                21   /* int; get MTU of last xmit = EMSGSIZE */
      #define        IP_IPSEC_POLICY                22   /* struct; get/set security policy */
      #define        IP_RECVTTL                23   /* bool; receive IP TTL w/dgram */
      #define        IP_MINTTL                24   /* minimum TTL for packet or drop */
      #define        IP_PKTINFO                25   /* struct; set default src if/addr */
      #define        IP_RECVPKTINFO                26   /* int; receive dst if/addr w/dgram */
      
      #define IP_SENDSRCADDR IP_RECVDSTADDR /* FreeBSD compatibility */
      
      /*
       * Information sent in the control message of a datagram socket for
       * IP_PKTINFO and IP_RECVPKTINFO.
       */
      struct in_pktinfo {
              struct in_addr        ipi_addr;        /* src/dst address */
              unsigned int ipi_ifindex;        /* interface index */
      };
      
      #define ipi_spec_dst ipi_addr        /* Solaris/Linux compatibility */
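
       /*
        * Illustrative sketch, not part of the original header: a UDP receiver
        * asks for per-datagram destination information and reads it back from
        * the control messages.  "s" is an already-created UDP socket, the
        * msg_control/msg_controllen setup is omitted, and the cmsg level/type
        * used here (IPPROTO_IP / IP_PKTINFO) reflects the conventional delivery
        * of this option and is stated as an assumption.
        *
        *        const int on = 1;
        *        struct msghdr msg;
        *        struct cmsghdr *cmsg;
        *        struct in_pktinfo pi;
        *
        *        setsockopt(s, IPPROTO_IP, IP_RECVPKTINFO, &on, sizeof(on));
        *        recvmsg(s, &msg, 0);
        *        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
        *             cmsg = CMSG_NXTHDR(&msg, cmsg)) {
        *                if (cmsg->cmsg_level == IPPROTO_IP &&
        *                    cmsg->cmsg_type == IP_PKTINFO)
        *                        memcpy(&pi, CMSG_DATA(cmsg), sizeof(pi));
        *        }
        *
        * pi.ipi_addr then holds the destination address of the datagram and
        * pi.ipi_ifindex the index of the interface it arrived on.
        */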
      
      /*
       * Defaults and limits for options
       */
      #define        IP_DEFAULT_MULTICAST_TTL  1        /* normally limit m'casts to 1 hop  */
      #define        IP_DEFAULT_MULTICAST_LOOP 1        /* normally hear sends if a member  */
      #define        IP_MAX_MEMBERSHIPS        20        /* per socket; must fit in one mbuf */
      
      /*
       * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP.
       */
      struct ip_mreq {
              struct        in_addr imr_multiaddr;        /* IP multicast address of group */
              struct        in_addr imr_interface;        /* local IP address of interface */
      };
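
       /*
        * Illustrative sketch, not part of the original header: joining a
        * multicast group on the interface chosen by the kernel.  "s" is an
        * already-created UDP socket and the group address is an arbitrary
        * example value.
        *
        *        struct ip_mreq mreq;
        *
        *        mreq.imr_multiaddr.s_addr = inet_addr("239.0.0.1");
        *        mreq.imr_interface.s_addr = htonl(INADDR_ANY);
        *        if (setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP,
        *            &mreq, sizeof(mreq)) == -1)
        *                warn("IP_ADD_MEMBERSHIP");
        */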
      
      /*
       * Argument for IP_PORTRANGE:
       * - which range to search when port is unspecified at bind() or connect()
       */
      #define        IP_PORTRANGE_DEFAULT        0        /* default range */
      #define        IP_PORTRANGE_HIGH        1        /* same as DEFAULT (FreeBSD compat) */
      #define        IP_PORTRANGE_LOW        2        /* use privileged range */
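
       /*
        * Illustrative sketch, not part of the original header: selecting the
        * privileged ("low") range before letting the kernel pick a port.  "s"
        * is an already-created socket, and binding to a low port still requires
        * the usual privilege.
        *
        *        const int range = IP_PORTRANGE_LOW;
        *        struct sockaddr_in sin;
        *
        *        memset(&sin, 0, sizeof(sin));
        *        sin.sin_family = AF_INET;
        *        sin.sin_len = sizeof(sin);
        *        sin.sin_port = 0;
        *        sin.sin_addr.s_addr = htonl(INADDR_ANY);
        *        setsockopt(s, IPPROTO_IP, IP_PORTRANGE, &range, sizeof(range));
        *        bind(s, (struct sockaddr *)&sin, sizeof(sin));
        *
        * With sin_port == 0 the kernel chooses a port from the selected range
        * (IPPORT_RESERVEDMIN..IPPORT_RESERVEDMAX here) instead of the default
        * net.inet.ip.anonportmin..anonportmax range.
        */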
      
      #if defined(_NETBSD_SOURCE)
      /*
       * Definitions for inet sysctl operations.
       *
       * Third level is protocol number.
       * Fourth level is desired variable within that protocol.
       */
      
      /*
       * Names for IP sysctl objects
       */
      #define        IPCTL_FORWARDING        1        /* act as router */
      #define        IPCTL_SENDREDIRECTS        2        /* may send redirects when forwarding */
      #define        IPCTL_DEFTTL                3        /* default TTL */
      /* IPCTL_DEFMTU=4, never implemented */
      #define        IPCTL_FORWSRCRT                5        /* forward source-routed packets */
      #define        IPCTL_DIRECTEDBCAST        6        /* default broadcast behavior */
      #define        IPCTL_ALLOWSRCRT        7        /* allow/drop all source-routed pkts */
      #define        IPCTL_SUBNETSARELOCAL        8        /* treat subnets as local addresses */
      #define        IPCTL_MTUDISC                9        /* allow path MTU discovery */
      #define        IPCTL_ANONPORTMIN      10        /* minimum ephemeral port */
      #define        IPCTL_ANONPORTMAX      11        /* maximum ephemeral port */
       #define        IPCTL_MTUDISCTIMEOUT   12        /* timeout for path MTU discovery routes */
      #define        IPCTL_MAXFLOWS         13        /* maximum ip flows allowed */
      #define        IPCTL_HOSTZEROBROADCAST 14        /* is host zero a broadcast addr? */
      #define        IPCTL_GIF_TTL                15        /* default TTL for gif encap packet */
      #define        IPCTL_LOWPORTMIN       16        /* minimum reserved port */
      #define        IPCTL_LOWPORTMAX       17        /* maximum reserved port */
      #define        IPCTL_MAXFRAGPACKETS   18        /* max packets reassembly queue */
      #define        IPCTL_GRE_TTL          19        /* default TTL for gre encap packet */
      #define        IPCTL_CHECKINTERFACE   20        /* drop pkts in from 'wrong' iface */
      #define        IPCTL_IFQ               21        /* IP packet input queue */
      #define        IPCTL_RANDOMID               22        /* use random IP ids (if configured) */
      #define        IPCTL_LOOPBACKCKSUM    23        /* do IP checksum on loopback */
      #define        IPCTL_STATS                24        /* IP statistics */
      #define        IPCTL_DAD_COUNT        25        /* DAD packets to send */
      
      #endif /* _NETBSD_SOURCE */
      
      /* INET6 stuff */
      #define        __KAME_NETINET_IN_H_INCLUDED_
      #include <netinet6/in6.h>
      #undef __KAME_NETINET_IN_H_INCLUDED_
      
      #ifdef _KERNEL
      #include <sys/psref.h>
      
      /*
       * in_cksum_phdr:
       *
       *        Compute significant parts of the IPv4 checksum pseudo-header
       *        for use in a delayed TCP/UDP checksum calculation.
       *
       *        Args:
       *
       *                src                Source IP address
       *                dst                Destination IP address
       *                lenproto        htons(proto-hdr-len + proto-number)
       */
      static __inline u_int16_t __unused
      in_cksum_phdr(u_int32_t src, u_int32_t dst, u_int32_t lenproto)
      {
              u_int32_t sum;
      
              sum = lenproto +
                    (u_int16_t)(src >> 16) +
                    (u_int16_t)(src /*& 0xffff*/) +
                    (u_int16_t)(dst >> 16) +
                    (u_int16_t)(dst /*& 0xffff*/);
      
              sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/);
      
              if (sum > 0xffff)
                      sum -= 0xffff;
      
               return (sum);
      }
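
       /*
        * Illustrative sketch, not part of the original header: seeding an
        * outgoing UDP checksum with the pseudo-header before the rest of the
        * sum is folded in by hardware or by in_cksum().  "ip", "uh" and
        * "udp_len" (UDP header plus payload length) are hypothetical names.
        *
        *        uh->uh_sum = in_cksum_phdr(ip->ip_src.s_addr, ip->ip_dst.s_addr,
        *            htons(udp_len + IPPROTO_UDP));
        */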
      
      /*
       * in_cksum_addword:
       *
       *        Add the two 16-bit network-order values, carry, and return.
       */
      static __inline u_int16_t __unused
      in_cksum_addword(u_int16_t a, u_int16_t b)
      {
              u_int32_t sum = a + b;
      
              if (sum > 0xffff)
                       sum -= 0xffff;
      
               return (sum);
      }
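
       /*
        * Worked example (illustrative): in_cksum_addword(0xffff, 0x0002) first
        * computes 0x10001; the end-around carry then folds it back into 16
        * bits, 0x10001 - 0xffff = 0x0002.
        */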
      
      extern        struct in_addr zeroin_addr;
      extern        u_char        ip_protox[];
      extern const struct sockaddr_in in_any;
      
      int        in_broadcast(struct in_addr, struct ifnet *);
      int        in_direct(struct in_addr, struct ifnet *);
      int        in_canforward(struct in_addr);
      int        cpu_in_cksum(struct mbuf *, int, int, uint32_t);
      int        in_cksum(struct mbuf *, int);
      int        in4_cksum(struct mbuf *, u_int8_t, int, int);
      int        in_localaddr(struct in_addr);
      void        in_socktrim(struct sockaddr_in *);
      
      void        in_if_link_up(struct ifnet *);
      void        in_if_link_down(struct ifnet *);
      void        in_if_up(struct ifnet *);
      void        in_if_down(struct ifnet *);
      void        in_if_link_state_change(struct ifnet *, int);
      
      struct route;
      struct ip_moptions;
      
      struct in_ifaddr *in_selectsrc(struct sockaddr_in *,
              struct route *, int, struct ip_moptions *, int *, struct psref *);
      
      struct ip;
      int in_tunnel_validate(const struct ip *, struct in_addr, struct in_addr);
      
      #define        in_hosteq(s,t)        ((s).s_addr == (t).s_addr)
      #define        in_nullhost(x)        ((x).s_addr == INADDR_ANY)
      
      #define        satosin(sa)        ((struct sockaddr_in *)(sa))
      #define        satocsin(sa)        ((const struct sockaddr_in *)(sa))
      #define        sintosa(sin)        ((struct sockaddr *)(sin))
      #define        sintocsa(sin)        ((const struct sockaddr *)(sin))
      #define        ifatoia(ifa)        ((struct in_ifaddr *)(ifa))
      
      int sockaddr_in_cmp(const struct sockaddr *, const struct sockaddr *);
      const void *sockaddr_in_const_addr(const struct sockaddr *, socklen_t *);
      void *sockaddr_in_addr(struct sockaddr *, socklen_t *);
      
      static __inline void
      sockaddr_in_init1(struct sockaddr_in *sin, const struct in_addr *addr,
          in_port_t port)
      {
              sin->sin_port = port;
              sin->sin_addr = *addr;
              memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
      }
      
      static __inline void
      sockaddr_in_init(struct sockaddr_in *sin, const struct in_addr *addr,
          in_port_t port)
      {
               sin->sin_family = AF_INET;
              sin->sin_len = sizeof(*sin);
              sockaddr_in_init1(sin, addr, port);
      }
      
      static __inline struct sockaddr *
      sockaddr_in_alloc(const struct in_addr *addr, in_port_t port, int flags)
      {
              struct sockaddr *sa;
      
              sa = sockaddr_alloc(AF_INET, sizeof(struct sockaddr_in), flags);
      
              if (sa == NULL)
                      return NULL;
      
              sockaddr_in_init1(satosin(sa), addr, port);
      
              return sa;
      }
      #endif /* _KERNEL */
      
      #if defined(_KERNEL) || defined(_TEST)
      int        in_print(char *, size_t, const struct in_addr *);
      #define IN_PRINT(b, a)        (in_print((b), sizeof(b), a), (b))
      int        sin_print(char *, size_t, const void *);
      #endif
      
      #endif /* !_NETINET_IN_H_ */
      /*        $NetBSD: tcp_input.c,v 1.416 2019/09/25 19:06:30 jnemeth Exp $        */
      
      /*
       * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the project nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      /*
       *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
       *
       * NRL grants permission for redistribution and use in source and binary
       * forms, with or without modification, of the software and documentation
       * created at NRL provided that the following conditions are met:
       *
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. All advertising materials mentioning features or use of this software
       *    must display the following acknowledgements:
       *      This product includes software developed by the University of
       *      California, Berkeley and its contributors.
       *      This product includes software developed at the Information
       *      Technology Division, US Naval Research Laboratory.
       * 4. Neither the name of the NRL nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
       * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
       * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
       * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
       * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
       * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
       * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
       * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
       * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
       * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       *
       * The views and conclusions contained in the software and documentation
       * are those of the authors and should not be interpreted as representing
       * official policies, either expressed or implied, of the US Naval
       * Research Laboratory (NRL).
       */
      
      /*-
       * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
       * 2011 The NetBSD Foundation, Inc.
       * All rights reserved.
       *
       * This code is derived from software contributed to The NetBSD Foundation
       * by Coyote Point Systems, Inc.
       * This code is derived from software contributed to The NetBSD Foundation
       * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
       * Facility, NASA Ames Research Center.
       * This code is derived from software contributed to The NetBSD Foundation
       * by Charles M. Hannum.
       * This code is derived from software contributed to The NetBSD Foundation
       * by Rui Paulo.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
       * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
       * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
       * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       * POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
       *        The Regents of the University of California.  All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)tcp_input.c        8.12 (Berkeley) 5/24/95
       */
      
      /*
       *        TODO list for SYN cache stuff:
       *
       *        Find room for a "state" field, which is needed to keep a
       *        compressed state for TIME_WAIT TCBs.  It's been noted already
       *        that this is fairly important for very high-volume web and
       *        mail servers, which use a large number of short-lived
       *        connections.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.416 2019/09/25 19:06:30 jnemeth Exp $");
      
      #ifdef _KERNEL_OPT
      #include "opt_inet.h"
      #include "opt_ipsec.h"
      #include "opt_inet_csum.h"
      #include "opt_tcp_debug.h"
      #endif
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/malloc.h>
      #include <sys/mbuf.h>
      #include <sys/protosw.h>
      #include <sys/socket.h>
      #include <sys/socketvar.h>
      #include <sys/errno.h>
      #include <sys/syslog.h>
      #include <sys/pool.h>
      #include <sys/domain.h>
      #include <sys/kernel.h>
      #ifdef TCP_SIGNATURE
      #include <sys/md5.h>
      #endif
      #include <sys/lwp.h> /* for lwp0 */
      #include <sys/cprng.h>
      
      #include <net/if.h>
      #include <net/if_types.h>
      
      #include <netinet/in.h>
      #include <netinet/in_systm.h>
      #include <netinet/ip.h>
      #include <netinet/in_pcb.h>
      #include <netinet/in_var.h>
      #include <netinet/ip_var.h>
      #include <netinet/in_offload.h>
      
      #ifdef INET6
      #include <netinet/ip6.h>
      #include <netinet6/ip6_var.h>
      #include <netinet6/in6_pcb.h>
      #include <netinet6/ip6_var.h>
      #include <netinet6/in6_var.h>
      #include <netinet/icmp6.h>
      #include <netinet6/nd6.h>
      #ifdef TCP_SIGNATURE
      #include <netinet6/scope6_var.h>
      #endif
      #endif
      
      #ifndef INET6
      #include <netinet/ip6.h>
      #endif
      
      #include <netinet/tcp.h>
      #include <netinet/tcp_fsm.h>
      #include <netinet/tcp_seq.h>
      #include <netinet/tcp_timer.h>
      #include <netinet/tcp_var.h>
      #include <netinet/tcp_private.h>
      #include <netinet/tcp_congctl.h>
      #include <netinet/tcp_debug.h>
      
      #ifdef INET6
      #include "faith.h"
      #if defined(NFAITH) && NFAITH > 0
      #include <net/if_faith.h>
      #endif
      #endif
      
      #ifdef IPSEC
      #include <netipsec/ipsec.h>
      #include <netipsec/key.h>
      #ifdef INET6
      #include <netipsec/ipsec6.h>
      #endif
      #endif        /* IPSEC*/
      
      #include <netinet/tcp_vtw.h>
      
      int        tcprexmtthresh = 3;
      int        tcp_log_refused;
      
      int        tcp_do_autorcvbuf = 1;
      int        tcp_autorcvbuf_inc = 16 * 1024;
      int        tcp_autorcvbuf_max = 256 * 1024;
      int        tcp_msl = (TCPTV_MSL / PR_SLOWHZ);
      
      static int tcp_rst_ppslim_count = 0;
      static struct timeval tcp_rst_ppslim_last;
      static int tcp_ackdrop_ppslim_count = 0;
      static struct timeval tcp_ackdrop_ppslim_last;
      
      static void syn_cache_timer(void *);
      
      #define TCP_PAWS_IDLE        (24U * 24 * 60 * 60 * PR_SLOWHZ)
      
      /* for modulo comparisons of timestamps */
      #define TSTMP_LT(a,b)        ((int)((a)-(b)) < 0)
      #define TSTMP_GEQ(a,b)        ((int)((a)-(b)) >= 0)
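
       /*
        * Worked example (illustrative): the signed difference makes these
        * comparisons robust across 32-bit wraparound.  With a = 0x00000002 and
        * b = 0xfffffffe, a - b is 0x00000004, so TSTMP_GEQ(a, b) holds even
        * though a is numerically smaller than b as an unsigned value.
        */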
      
      /*
       * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
       */
      #ifdef INET6
      static inline void
      nd6_hint(struct tcpcb *tp)
      {
              struct rtentry *rt = NULL;
      
              if (tp != NULL && tp->t_in6pcb != NULL && tp->t_family == AF_INET6 &&
                  (rt = rtcache_validate(&tp->t_in6pcb->in6p_route)) != NULL)
                      nd6_nud_hint(rt);
              rtcache_unref(rt, &tp->t_in6pcb->in6p_route);
      }
      #else
      static inline void
      nd6_hint(struct tcpcb *tp)
      {
      }
      #endif
      
      /*
       * Compute ACK transmission behavior.  Delay the ACK unless
       * we have already delayed an ACK (must send an ACK every two segments).
       * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
       * option is enabled.
       */
      static void
      tcp_setup_ack(struct tcpcb *tp, const struct tcphdr *th)
      {
      
              if (tp->t_flags & TF_DELACK ||
                  (tcp_ack_on_push && th->th_flags & TH_PUSH))
                      tp->t_flags |= TF_ACKNOW;
              else
                      TCP_SET_DELACK(tp);
      }
      
      static void
      icmp_check(struct tcpcb *tp, const struct tcphdr *th, int acked)
      {
      
              /*
               * If we had a pending ICMP message that refers to data that have
               * just been acknowledged, disregard the recorded ICMP message.
               */
              if ((tp->t_flags & TF_PMTUD_PEND) &&
                  SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
                      tp->t_flags &= ~TF_PMTUD_PEND;
      
              /*
               * Keep track of the largest chunk of data
               * acknowledged since last PMTU update
               */
              if (tp->t_pmtud_mss_acked < acked)
                      tp->t_pmtud_mss_acked = acked;
      }
      
      /*
       * Convert TCP protocol fields to host order for easier processing.
       */
      static void
      tcp_fields_to_host(struct tcphdr *th)
      {
      
              NTOHL(th->th_seq);
              NTOHL(th->th_ack);
              NTOHS(th->th_win);
              NTOHS(th->th_urp);
      }
      
      /*
       * ... and reverse the above.
       */
      static void
      tcp_fields_to_net(struct tcphdr *th)
      {
      
              HTONL(th->th_seq);
              HTONL(th->th_ack);
              HTONS(th->th_win);
              HTONS(th->th_urp);
      }
      
      static void
      tcp_urp_drop(struct tcphdr *th, int todrop, int *tiflags)
      {
              if (th->th_urp > todrop) {
                      th->th_urp -= todrop;
              } else {
                      *tiflags &= ~TH_URG;
                      th->th_urp = 0;
              }
      }
      
      #ifdef TCP_CSUM_COUNTERS
      #include <sys/device.h>
      
      extern struct evcnt tcp_hwcsum_ok;
      extern struct evcnt tcp_hwcsum_bad;
      extern struct evcnt tcp_hwcsum_data;
      extern struct evcnt tcp_swcsum;
      #if defined(INET6)
      extern struct evcnt tcp6_hwcsum_ok;
      extern struct evcnt tcp6_hwcsum_bad;
      extern struct evcnt tcp6_hwcsum_data;
      extern struct evcnt tcp6_swcsum;
      #endif /* defined(INET6) */
      
      #define        TCP_CSUM_COUNTER_INCR(ev)        (ev)->ev_count++
      
      #else
      
      #define        TCP_CSUM_COUNTER_INCR(ev)        /* nothing */
      
      #endif /* TCP_CSUM_COUNTERS */
      
      #ifdef TCP_REASS_COUNTERS
      #include <sys/device.h>
      
      extern struct evcnt tcp_reass_;
      extern struct evcnt tcp_reass_empty;
      extern struct evcnt tcp_reass_iteration[8];
      extern struct evcnt tcp_reass_prependfirst;
      extern struct evcnt tcp_reass_prepend;
      extern struct evcnt tcp_reass_insert;
      extern struct evcnt tcp_reass_inserttail;
      extern struct evcnt tcp_reass_append;
      extern struct evcnt tcp_reass_appendtail;
      extern struct evcnt tcp_reass_overlaptail;
      extern struct evcnt tcp_reass_overlapfront;
      extern struct evcnt tcp_reass_segdup;
      extern struct evcnt tcp_reass_fragdup;
      
      #define        TCP_REASS_COUNTER_INCR(ev)        (ev)->ev_count++
      
      #else
      
      #define        TCP_REASS_COUNTER_INCR(ev)        /* nothing */
      
      #endif /* TCP_REASS_COUNTERS */
      
      static int tcp_reass(struct tcpcb *, const struct tcphdr *, struct mbuf *,
          int);
      static int tcp_dooptions(struct tcpcb *, const u_char *, int,
          struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *);
      
      static void tcp4_log_refused(const struct ip *, const struct tcphdr *);
      #ifdef INET6
      static void tcp6_log_refused(const struct ip6_hdr *, const struct tcphdr *);
      #endif
      
      #if defined(MBUFTRACE)
      struct mowner tcp_reass_mowner = MOWNER_INIT("tcp", "reass");
      #endif /* defined(MBUFTRACE) */
      
      static struct pool tcpipqent_pool;
      
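      /*
       * Initialize the pool from which TCP reassembly queue entries
       * (struct ipqent) are allocated.
       */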
      void
      tcpipqent_init(void)
      {
      
              pool_init(&tcpipqent_pool, sizeof(struct ipqent), 0, 0, 0, "tcpipqepl",
                  NULL, IPL_VM);
      }
      
      struct ipqent *
      tcpipqent_alloc(void)
      {
              struct ipqent *ipqe;
              int s;
      
              s = splvm();
              ipqe = pool_get(&tcpipqent_pool, PR_NOWAIT);
              splx(s);
      
              return ipqe;
      }
      
      void
      tcpipqent_free(struct ipqent *ipqe)
      {
              int s;
      
              s = splvm();
              pool_put(&tcpipqent_pool, ipqe);
              splx(s);
      }
      
      /*
       * Insert segment ti into reassembly queue of tcp with
       * control block tp.  Return TH_FIN if reassembly now includes
       * a segment with FIN.
       */
      static int
      tcp_reass(struct tcpcb *tp, const struct tcphdr *th, struct mbuf *m, int tlen)
      {
              struct ipqent *p, *q, *nq, *tiqe = NULL;
              struct socket *so = NULL;
              int pkt_flags;
              tcp_seq pkt_seq;
              unsigned pkt_len;
              u_long rcvpartdupbyte = 0;
              u_long rcvoobyte;
      #ifdef TCP_REASS_COUNTERS
              u_int count = 0;
      #endif
              uint64_t *tcps;
      
              if (tp->t_inpcb)
                      so = tp->t_inpcb->inp_socket;
      #ifdef INET6
              else if (tp->t_in6pcb)
                      so = tp->t_in6pcb->in6p_socket;
      #endif
      
              TCP_REASS_LOCK_CHECK(tp);
      
              /*
               * Call with th==NULL after becoming established, to
               * force pre-ESTABLISHED data up to the user socket.
               */
              if (th == NULL)
                      goto present;
      
              m_claimm(m, &tcp_reass_mowner);
      
              rcvoobyte = tlen;
              /*
               * Copy these to local variables because the TCP header gets munged
               * while we are collapsing mbufs.
               */
              pkt_seq = th->th_seq;
              pkt_len = tlen;
              pkt_flags = th->th_flags;
      
              TCP_REASS_COUNTER_INCR(&tcp_reass_);
      
              if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) {
                      /*
                       * When we miss a packet, the vast majority of the time
                       * we get packets that follow it in order.  So optimize
                       * for that.
                       */
                      if (pkt_seq == p->ipqe_seq + p->ipqe_len) {
                              p->ipqe_len += pkt_len;
                              p->ipqe_flags |= pkt_flags;
                              m_cat(p->ipqe_m, m);
                              m = NULL;
                              tiqe = p;
                              TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq);
                              TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail);
                              goto skip_replacement;
                      }
                      /*
                       * While we're here, if the pkt is completely beyond
                       * anything we have, just insert it at the tail.
                       */
                      if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) {
                              TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail);
                              goto insert_it;
                      }
              }
      
              q = TAILQ_FIRST(&tp->segq);
      
              if (q != NULL) {
                      /*
                       * If this segment immediately precedes the first out-of-order
                       * block, simply slap the segment in front of it and (mostly)
                       * skip the complicated logic.
                       */
                      if (pkt_seq + pkt_len == q->ipqe_seq) {
                              q->ipqe_seq = pkt_seq;
                              q->ipqe_len += pkt_len;
                              q->ipqe_flags |= pkt_flags;
                              m_cat(m, q->ipqe_m);
                              q->ipqe_m = m;
                              tiqe = q;
                              TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
                              TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst);
                              goto skip_replacement;
                      }
              } else {
                      TCP_REASS_COUNTER_INCR(&tcp_reass_empty);
              }
      
              /*
               * Find a segment which begins after this one does.
               */
              for (p = NULL; q != NULL; q = nq) {
                      nq = TAILQ_NEXT(q, ipqe_q);
      #ifdef TCP_REASS_COUNTERS
                      count++;
      #endif
      
                      /*
                       * If the received segment is just right after this
                       * fragment, merge the two together and then check
                       * for further overlaps.
                       */
                      if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
                              pkt_len += q->ipqe_len;
                              pkt_flags |= q->ipqe_flags;
                              pkt_seq = q->ipqe_seq;
                              m_cat(q->ipqe_m, m);
                              m = q->ipqe_m;
                              TCP_REASS_COUNTER_INCR(&tcp_reass_append);
                              goto free_ipqe;
                      }
      
                      /*
                       * If the received segment is completely past this
                       * fragment, we need to go to the next fragment.
                       */
                      if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
                              p = q;
                              continue;
                      }
      
                      /*
                       * If the fragment is past the received segment,
                       * it (or any following) can't be concatenated.
                       */
                      if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) {
                              TCP_REASS_COUNTER_INCR(&tcp_reass_insert);
                              break;
                      }
      
                      /*
                       * We've received all the data in this segment before.
                       * Mark it as a duplicate and return.
                       */
                      if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
                          SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
                              tcps = TCP_STAT_GETREF();
                              tcps[TCP_STAT_RCVDUPPACK]++;
                              tcps[TCP_STAT_RCVDUPBYTE] += pkt_len;
                              TCP_STAT_PUTREF();
                              tcp_new_dsack(tp, pkt_seq, pkt_len);
                              m_freem(m);
                              if (tiqe != NULL) {
                                      tcpipqent_free(tiqe);
                              }
                              TCP_REASS_COUNTER_INCR(&tcp_reass_segdup);
                              goto out;
                      }
      
                      /*
                       * Received segment completely overlaps this fragment
                       * so we drop the fragment (this keeps the temporal
                       * ordering of segments correct).
                       */
                      if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
                          SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
                              rcvpartdupbyte += q->ipqe_len;
                              m_freem(q->ipqe_m);
                              TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup);
                              goto free_ipqe;
                      }
      
                      /*
                       * Received segment extends past the end of the fragment.
                       * Drop the overlapping bytes, merge the fragment and
                       * segment, and treat as a longer received packet.
                       */
                      if (SEQ_LT(q->ipqe_seq, pkt_seq) &&
                          SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq))  {
                              int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
                              m_adj(m, overlap);
                              rcvpartdupbyte += overlap;
                              m_cat(q->ipqe_m, m);
                              m = q->ipqe_m;
                              pkt_seq = q->ipqe_seq;
                              pkt_len += q->ipqe_len - overlap;
                              rcvoobyte -= overlap;
                              TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail);
                              goto free_ipqe;
                      }
      
                      /*
                       * Received segment extends past the front of the fragment.
                       * Drop the overlapping bytes on the received packet. The
                       * packet will then be concatenated with this fragment a
                       * bit later.
                       */
                      if (SEQ_GT(q->ipqe_seq, pkt_seq) &&
                          SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len))  {
                              int overlap = pkt_seq + pkt_len - q->ipqe_seq;
                              m_adj(m, -overlap);
                              pkt_len -= overlap;
                              rcvpartdupbyte += overlap;
                              TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront);
                              rcvoobyte -= overlap;
                      }
      
                      /*
                       * If the received segment immediately precedes this
                       * fragment then tack the fragment onto this segment
                       * and reinsert the data.
                       */
                      if (q->ipqe_seq == pkt_seq + pkt_len) {
                              pkt_len += q->ipqe_len;
                              pkt_flags |= q->ipqe_flags;
                              m_cat(m, q->ipqe_m);
                              TAILQ_REMOVE(&tp->segq, q, ipqe_q);
                              TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
                              tp->t_segqlen--;
                              KASSERT(tp->t_segqlen >= 0);
                              KASSERT(tp->t_segqlen != 0 ||
                                  (TAILQ_EMPTY(&tp->segq) &&
                                  TAILQ_EMPTY(&tp->timeq)));
                              if (tiqe == NULL) {
                                      tiqe = q;
                              } else {
                                      tcpipqent_free(q);
                              }
                              TCP_REASS_COUNTER_INCR(&tcp_reass_prepend);
                              break;
                      }
      
                      /*
                       * If the fragment is before the segment, remember it.
                       * When this loop is terminated, p will contain the
                       * pointer to the fragment that is right before the
                       * received segment.
                       */
                      if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
                              p = q;
      
                      continue;
      
                      /*
                       * This is a common operation.  It also saves a
                       * malloc/free in most instances.
                       */
                free_ipqe:
                      TAILQ_REMOVE(&tp->segq, q, ipqe_q);
                      TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
                      tp->t_segqlen--;
                      KASSERT(tp->t_segqlen >= 0);
                      KASSERT(tp->t_segqlen != 0 ||
                          (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
                      if (tiqe == NULL) {
                              tiqe = q;
                      } else {
                              tcpipqent_free(q);
                      }
              }
      
      #ifdef TCP_REASS_COUNTERS
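              /*
               * Histogram of queue-walk lengths: buckets 1..7 count walks of
               * exactly that many entries, and bucket 0 collects anything
               * longer.
               */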
              if (count > 7)
                      TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]);
              else if (count > 0)
                      TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]);
      #endif
      
      insert_it:
              /*
               * Allocate a new queue entry (block) since the received segment
               * did not collapse onto any other out-of-order block. If it had
               * collapsed, tiqe would not be NULL and we would be reusing it.
               *
               * If the allocation fails, drop the packet.
               */
              if (tiqe == NULL) {
                      tiqe = tcpipqent_alloc();
                      if (tiqe == NULL) {
                              TCP_STATINC(TCP_STAT_RCVMEMDROP);
                              m_freem(m);
                              goto out;
                      }
              }
      
              /*
               * Update the counters.
               */
              tp->t_rcvoopack++;
              tcps = TCP_STAT_GETREF();
              tcps[TCP_STAT_RCVOOPACK]++;
              tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte;
              if (rcvpartdupbyte) {
                      tcps[TCP_STAT_RCVPARTDUPPACK]++;
                      tcps[TCP_STAT_RCVPARTDUPBYTE] += rcvpartdupbyte;
              }
              TCP_STAT_PUTREF();
      
              /*
               * Insert the new fragment queue entry into both queues.
               */
              tiqe->ipqe_m = m;
              tiqe->ipqe_seq = pkt_seq;
              tiqe->ipqe_len = pkt_len;
              tiqe->ipqe_flags = pkt_flags;
              if (p == NULL) {
                      TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
              } else {
                      TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q);
              }
              tp->t_segqlen++;
      
      skip_replacement:
              TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);
      
      present:
              /*
               * Present data to user, advancing rcv_nxt through
               * completed sequence space.
               */
              if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
                      goto out;
              q = TAILQ_FIRST(&tp->segq);
              if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
                      goto out;
              if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
                      goto out;
      
              tp->rcv_nxt += q->ipqe_len;
              pkt_flags = q->ipqe_flags & TH_FIN;
              nd6_hint(tp);
      
              TAILQ_REMOVE(&tp->segq, q, ipqe_q);
              TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
              tp->t_segqlen--;
              KASSERT(tp->t_segqlen >= 0);
              KASSERT(tp->t_segqlen != 0 ||
                  (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
              if (so->so_state & SS_CANTRCVMORE)
                      m_freem(q->ipqe_m);
              else
                      sbappendstream(&so->so_rcv, q->ipqe_m);
              tcpipqent_free(q);
              TCP_REASS_UNLOCK(tp);
              sorwakeup(so);
              return pkt_flags;
      
      out:
              TCP_REASS_UNLOCK(tp);
              return 0;
      }
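      
      /*
       * Illustrative sketch (not compiled): the overlap tests above rely on
       * the wraparound-safe SEQ_LT/SEQ_GT/SEQ_LEQ/SEQ_GEQ macros, which
       * compare 32-bit sequence numbers by the sign of their difference.
       * A minimal user-space analogue, under that assumption, would be:
       */
      #if 0
      #include <stdbool.h>
      #include <stdint.h>
      
      /* True if sequence number a is "before" b, modulo 2^32. */
      static bool
      seq_lt(uint32_t a, uint32_t b)
      {
      
              return (int32_t)(a - b) < 0;
      }
      
      /*
       * Example: just before the wrap, seq_lt(0xfffffff0, 0x00000010) is true,
       * because the difference interpreted as a signed 32-bit value is -32.
       */
      #endif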
      
      #ifdef INET6
      int
      tcp6_input(struct mbuf **mp, int *offp, int proto)
      {
              struct mbuf *m = *mp;
      
              /*
               * draft-itojun-ipv6-tcp-to-anycast
               * is there a better place to put this?
               */
              if (m->m_flags & M_ANYCAST6) {
                      struct ip6_hdr *ip6;
                      if (m->m_len < sizeof(struct ip6_hdr)) {
                              if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
                                      TCP_STATINC(TCP_STAT_RCVSHORT);
                                      return IPPROTO_DONE;
                              }
                      }
                      ip6 = mtod(m, struct ip6_hdr *);
                      icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
                          (char *)&ip6->ip6_dst - (char *)ip6);
                      return IPPROTO_DONE;
              }
      
              tcp_input(m, *offp, proto);
              return IPPROTO_DONE;
      }
      #endif
      
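      /*
       * Log a connection attempt that was refused because no matching PCB
       * was found for the incoming SYN.
       */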
      static void
      tcp4_log_refused(const struct ip *ip, const struct tcphdr *th)
      {
              char src[INET_ADDRSTRLEN];
              char dst[INET_ADDRSTRLEN];
      
              if (ip) {
                      in_print(src, sizeof(src), &ip->ip_src);
                      in_print(dst, sizeof(dst), &ip->ip_dst);
              } else {
                      strlcpy(src, "(unknown)", sizeof(src));
                      strlcpy(dst, "(unknown)", sizeof(dst));
              }
              log(LOG_INFO,
                  "Connection attempt to TCP %s:%d from %s:%d\n",
                  dst, ntohs(th->th_dport),
                  src, ntohs(th->th_sport));
      }
      
      #ifdef INET6
      static void
      tcp6_log_refused(const struct ip6_hdr *ip6, const struct tcphdr *th)
      {
              char src[INET6_ADDRSTRLEN];
              char dst[INET6_ADDRSTRLEN];
      
              if (ip6) {
                      in6_print(src, sizeof(src), &ip6->ip6_src);
                      in6_print(dst, sizeof(dst), &ip6->ip6_dst);
              } else {
                      strlcpy(src, "(unknown v6)", sizeof(src));
                      strlcpy(dst, "(unknown v6)", sizeof(dst));
              }
              log(LOG_INFO,
                  "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
                  dst, ntohs(th->th_dport),
                  src, ntohs(th->th_sport));
      }
      #endif
      
      /*
       * Checksum extended TCP header and data.
       */
      int
      tcp_input_checksum(int af, struct mbuf *m, const struct tcphdr *th,
          int toff, int off, int tlen)
      {
              struct ifnet *rcvif;
              int s;
      
              /*
               * XXX It would be better to record whether this mbuf has
               * already been checksum-verified, and to check that here.
               */
      
              rcvif = m_get_rcvif(m, &s);
              if (__predict_false(rcvif == NULL))
                      goto badcsum; /* XXX */
      
              switch (af) {
              case AF_INET:
                      switch (m->m_pkthdr.csum_flags &
                              ((rcvif->if_csum_flags_rx & M_CSUM_TCPv4) |
                               M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
                      case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD:
                              TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad);
                              goto badcsum;
      
                      case M_CSUM_TCPv4|M_CSUM_DATA: {
                              u_int32_t hw_csum = m->m_pkthdr.csum_data;
      
                              TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data);
                              if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) {
                                      const struct ip *ip =
                                          mtod(m, const struct ip *);
      
                                      hw_csum = in_cksum_phdr(ip->ip_src.s_addr,
                                          ip->ip_dst.s_addr,
                                          htons(hw_csum + tlen + off + IPPROTO_TCP));
                              }
                              if ((hw_csum ^ 0xffff) != 0)
                                      goto badcsum;
                              break;
                      }
      
                      case M_CSUM_TCPv4:
                              /* Checksum was okay. */
                              TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok);
                              break;
      
                      default:
                              /*
                               * Must compute it ourselves.  Maybe skip checksum
                               * on loopback interfaces.
                               */
                              if (__predict_true(!(rcvif->if_flags & IFF_LOOPBACK) ||
                                                 tcp_do_loopback_cksum)) {
                                      TCP_CSUM_COUNTER_INCR(&tcp_swcsum);
                                      if (in4_cksum(m, IPPROTO_TCP, toff,
                                                    tlen + off) != 0)
                                              goto badcsum;
                              }
                              break;
                      }
                      break;
      
      #ifdef INET6
              case AF_INET6:
                      switch (m->m_pkthdr.csum_flags &
                              ((rcvif->if_csum_flags_rx & M_CSUM_TCPv6) |
                               M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
                      case M_CSUM_TCPv6|M_CSUM_TCP_UDP_BAD:
                              TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad);
                              goto badcsum;
      
      #if 0 /* notyet */
                      case M_CSUM_TCPv6|M_CSUM_DATA:
      #endif
      
                      case M_CSUM_TCPv6:
                              /* Checksum was okay. */
                              TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok);
                              break;
      
                      default:
                              /*
                               * Must compute it ourselves.  Maybe skip checksum
                               * on loopback interfaces.
                               */
                              if (__predict_true((m->m_flags & M_LOOP) == 0 ||
                                  tcp_do_loopback_cksum)) {
                                      TCP_CSUM_COUNTER_INCR(&tcp6_swcsum);
                                      if (in6_cksum(m, IPPROTO_TCP, toff,
                                          tlen + off) != 0)
                                              goto badcsum;
                              }
                      }
                      break;
      #endif /* INET6 */
              }
              m_put_rcvif(rcvif, &s);
      
              return 0;
      
      badcsum:
              m_put_rcvif(rcvif, &s);
              TCP_STATINC(TCP_STAT_RCVBADSUM);
              return -1;
      }
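      
      /*
       * Illustrative sketch (not compiled): in the M_CSUM_DATA case above, the
       * driver supplies a partial one's-complement sum; in_cksum_phdr() folds
       * in the pseudo-header, and the packet is accepted only when the result
       * is 0xffff (hence the "hw_csum ^ 0xffff" test).  The 16-bit fold used
       * by such checks looks roughly like this:
       */
      #if 0
      #include <stdint.h>
      
      static uint16_t
      ones_complement_fold(uint32_t sum)
      {
      
              /* Add the carries back in until the sum fits in 16 bits. */
              while (sum >> 16)
                      sum = (sum & 0xffff) + (sum >> 16);
              return (uint16_t)sum;
      }
      #endif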
      
      /*
       * When a packet arrives addressed to a vestigial tcpcb, we
       * nevertheless have to respond to it per the spec.
       *
       * This code is duplicated from the one in tcp_input().
       */
      static void
      tcp_vtw_input(struct tcphdr *th, vestigial_inpcb_t *vp, struct mbuf *m,
          int tlen)
      {
              int tiflags;
              int todrop;
              uint32_t t_flags = 0;
              uint64_t *tcps;
      
              tiflags = th->th_flags;
              todrop  = vp->rcv_nxt - th->th_seq;
      
              if (todrop > 0) {
                      if (tiflags & TH_SYN) {
                              tiflags &= ~TH_SYN;
                              th->th_seq++;
                              tcp_urp_drop(th, 1, &tiflags);
                              todrop--;
                      }
                      if (todrop > tlen ||
                          (todrop == tlen && (tiflags & TH_FIN) == 0)) {
                              /*
                               * Any valid FIN or RST must be to the left of the
                               * window.  At this point the FIN or RST must be a
                               * duplicate or out of sequence; drop it.
                               */
                              if (tiflags & TH_RST)
                                      goto drop;
                              tiflags &= ~(TH_FIN|TH_RST);
      
                              /*
                               * Send an ACK to resynchronize and drop any data.
                               * But keep on processing for RST or ACK.
                               */
                              t_flags |= TF_ACKNOW;
                              todrop = tlen;
                              tcps = TCP_STAT_GETREF();
                              tcps[TCP_STAT_RCVDUPPACK] += 1;
                              tcps[TCP_STAT_RCVDUPBYTE] += todrop;
                              TCP_STAT_PUTREF();
                      } else if ((tiflags & TH_RST) &&
                          th->th_seq != vp->rcv_nxt) {
                              /*
                               * Test for reset before adjusting the sequence
                               * number for overlapping data.
                               */
                              goto dropafterack_ratelim;
                      } else {
                              tcps = TCP_STAT_GETREF();
                              tcps[TCP_STAT_RCVPARTDUPPACK] += 1;
                              tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop;
                              TCP_STAT_PUTREF();
                      }
      
      //                tcp_new_dsack(tp, th->th_seq, todrop);
      //                hdroptlen += todrop;        /*drop from head afterwards*/
      
                      th->th_seq += todrop;
                      tlen -= todrop;
                      tcp_urp_drop(th, todrop, &tiflags);
              }
      
              /*
               * If new data are received on a connection after the
               * user processes are gone, then RST the other end.
               */
              if (tlen) {
                      TCP_STATINC(TCP_STAT_RCVAFTERCLOSE);
                      goto dropwithreset;
              }
      
              /*
               * If segment ends after window, drop trailing data
               * (and PUSH and FIN); if nothing left, just ACK.
               */
              todrop = (th->th_seq + tlen) - (vp->rcv_nxt + vp->rcv_wnd);
      
              if (todrop > 0) {
                      TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN);
                      if (todrop >= tlen) {
                              /*
                               * The segment actually starts after the window.
                               * th->th_seq + tlen - vp->rcv_nxt - vp->rcv_wnd >= tlen
                               * th->th_seq - vp->rcv_nxt - vp->rcv_wnd >= 0
                               * th->th_seq >= vp->rcv_nxt + vp->rcv_wnd
                               */
                              TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen);
      
                              /*
                               * If a new connection request is received
                               * while in TIME_WAIT, drop the old connection
                               * and start over if the sequence numbers
                               * are above the previous ones.
                               */
                              if ((tiflags & TH_SYN) &&
                                  SEQ_GT(th->th_seq, vp->rcv_nxt)) {
                                      /*
                                       * We only support this in the !NOFDREF case, which
                                       * is to say: not here.
                                       */
                                      goto dropwithreset;
                              }
      
                              /*
                               * If window is closed can only take segments at
                               * window edge, and have to drop data and PUSH from
                               * incoming segments.  Continue processing, but
                               * remember to ack.  Otherwise, drop segment
                               * and (if not RST) ack.
                               */
                              if (vp->rcv_wnd == 0 && th->th_seq == vp->rcv_nxt) {
                                      t_flags |= TF_ACKNOW;
                                      TCP_STATINC(TCP_STAT_RCVWINPROBE);
                              } else {
                                      goto dropafterack;
                              }
                      } else {
                              TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop);
                      }
                      m_adj(m, -todrop);
                      tlen -= todrop;
                      tiflags &= ~(TH_PUSH|TH_FIN);
              }
      
              if (tiflags & TH_RST) {
                      if (th->th_seq != vp->rcv_nxt)
                              goto dropafterack_ratelim;
      
                      vtw_del(vp->ctl, vp->vtw);
                      goto drop;
              }
      
              /*
               * If the ACK bit is off we drop the segment and return.
               */
              if ((tiflags & TH_ACK) == 0) {
                      if (t_flags & TF_ACKNOW)
                              goto dropafterack;
                      goto drop;
              }
      
              /*
               * In TIME_WAIT state the only thing that should arrive
               * is a retransmission of the remote FIN.  Acknowledge
               * it and restart the finack timer.
               */
              vtw_restart(vp);
              goto dropafterack;
      
      dropafterack:
              /*
               * Generate an ACK dropping incoming segment if it occupies
               * sequence space, where the ACK reflects our state.
               */
              if (tiflags & TH_RST)
                      goto drop;
              goto dropafterack2;
      
      dropafterack_ratelim:
              /*
               * We may want to rate-limit ACKs against SYN/RST attacks.
               */
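              /*
               * ppsratecheck() returns zero once the tcp_ackdrop_ppslim
               * packets-per-second budget has been exhausted; in that case
               * the ACK is simply suppressed.
               */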
              if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
                  tcp_ackdrop_ppslim) == 0) {
                      /* XXX stat */
                      goto drop;
              }
              /* ...fall into dropafterack2... */
      
      dropafterack2:
              (void)tcp_respond(0, m, m, th, th->th_seq + tlen, th->th_ack, TH_ACK);
              return;
      
      dropwithreset:
              /*
               * Generate a RST, dropping incoming segment.
               * Make ACK acceptable to originator of segment.
               */
              if (tiflags & TH_RST)
                      goto drop;
      
              if (tiflags & TH_ACK) {
                      tcp_respond(0, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
              } else {
                      if (tiflags & TH_SYN)
                              ++tlen;
                      (void)tcp_respond(0, m, m, th, th->th_seq + tlen, (tcp_seq)0,
                          TH_RST|TH_ACK);
              }
              return;
      drop:
              m_freem(m);
      }
      
      /*
       * TCP input routine, follows pages 65-76 of RFC 793 very closely.
       */
      void
      tcp_input(struct mbuf *m, int off, int proto)
      {
              struct tcphdr *th;
              struct ip *ip;
              struct inpcb *inp;
      #ifdef INET6
              struct ip6_hdr *ip6;
              struct in6pcb *in6p;
      #endif
              u_int8_t *optp = NULL;
              int optlen = 0;
              int len, tlen, hdroptlen = 0;
              struct tcpcb *tp = NULL;
              int tiflags;
              struct socket *so = NULL;
              int todrop, acked, ourfinisacked, needoutput = 0;
              bool dupseg;
      #ifdef TCP_DEBUG
              short ostate = 0;
      #endif
              u_long tiwin;
              struct tcp_opt_info opti;
              int thlen, iphlen;
              int af;                /* af on the wire */
              struct mbuf *tcp_saveti = NULL;
              uint32_t ts_rtt;
              uint8_t iptos;
              uint64_t *tcps;
              vestigial_inpcb_t vestige;
      
              vestige.valid = 0;
      
              MCLAIM(m, &tcp_rx_mowner);
      
              TCP_STATINC(TCP_STAT_RCVTOTAL);
      
              memset(&opti, 0, sizeof(opti));
              opti.ts_present = 0;
              opti.maxseg = 0;
      
              /*
               * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
               *
               * TCP is, by definition, unicast, so we reject all
               * multicast outright.
               *
               * Note, there are additional src/dst address checks in
               * the AF-specific code below.
               */
              if (m->m_flags & (M_BCAST|M_MCAST)) {
                      /* XXX stat */
                      goto drop;
              }
      #ifdef INET6
              if (m->m_flags & M_ANYCAST6) {
                      /* XXX stat */
                      goto drop;
              }
      #endif
      
              M_REGION_GET(th, struct tcphdr *, m, off, sizeof(struct tcphdr));
              if (th == NULL) {
                      TCP_STATINC(TCP_STAT_RCVSHORT);
                      return;
              }
      
              /*
               * Get IP and TCP header.
               * Note: IP leaves IP header in first mbuf.
               */
              ip = mtod(m, struct ip *);
              switch (ip->ip_v) {
              case 4:
      #ifdef INET6
                      ip6 = NULL;
      #endif
                      af = AF_INET;
                      iphlen = sizeof(struct ip);
      
                      if (IN_MULTICAST(ip->ip_dst.s_addr) ||
                          in_broadcast(ip->ip_dst, m_get_rcvif_NOMPSAFE(m)))
                              goto drop;
      
                      /* We do the checksum after PCB lookup... */
                      len = ntohs(ip->ip_len);
                      tlen = len - off;
                      iptos = ip->ip_tos;
                      break;
      #ifdef INET6
              case 6:
                      ip = NULL;
                      iphlen = sizeof(struct ip6_hdr);
                      af = AF_INET6;
                      ip6 = mtod(m, struct ip6_hdr *);
      
                      /*
                       * Be proactive about unspecified IPv6 addresses in the
                       * source.  As we use the all-zero address to indicate an
                       * unbound or unconnected pcb, an unspecified IPv6 source
                       * address can be used to confuse us.
                       *
                       * Note that packets with an unspecified IPv6 destination
                       * are already dropped in ip6_input.
                       */
                      if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
                              /* XXX stat */
                              goto drop;
                      }
      
                      /*
                       * Make sure destination address is not multicast.
                       * Source address checked in ip6_input().
                       */
                      if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
                              /* XXX stat */
                              goto drop;
                      }
      
                      /* We do the checksum after PCB lookup... */
                      len = m->m_pkthdr.len;
                      tlen = len - off;
                      iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
                      break;
      #endif
              default:
                      m_freem(m);
                      return;
              }
      
              /*
               * Enforce alignment requirements that are violated in
               * some cases, see kern/50766 for details.
               */
              if (TCP_HDR_ALIGNED_P(th) == 0) {
                      m = m_copyup(m, off + sizeof(struct tcphdr), 0);
                      if (m == NULL) {
                              TCP_STATINC(TCP_STAT_RCVSHORT);
                              return;
                      }
                      ip = mtod(m, struct ip *);
      #ifdef INET6
                      ip6 = mtod(m, struct ip6_hdr *);
      #endif
                      th = (struct tcphdr *)(mtod(m, char *) + off);
              }
              KASSERT(TCP_HDR_ALIGNED_P(th));
      
              /*
               * Check that TCP offset makes sense, pull out TCP options and
               * adjust length.
               */
              thlen = th->th_off << 2;
              if (thlen < sizeof(struct tcphdr) || thlen > tlen) {
                      TCP_STATINC(TCP_STAT_RCVBADOFF);
                      goto drop;
              }
              tlen -= thlen;
      
              if (thlen > sizeof(struct tcphdr)) {
                      M_REGION_GET(th, struct tcphdr *, m, off, thlen);
                      if (th == NULL) {
                              TCP_STATINC(TCP_STAT_RCVSHORT);
                              return;
                      }
                      KASSERT(TCP_HDR_ALIGNED_P(th));
                      optlen = thlen - sizeof(struct tcphdr);
                      optp = ((u_int8_t *)th) + sizeof(struct tcphdr);
      
                      /*
                       * Do quick retrieval of timestamp options.
                       *
                       * If timestamp is the only option and it's formatted as
                       * recommended in RFC 1323 appendix A, we quickly get the
                       * values now and don't bother calling tcp_dooptions(),
                       * etc.
                       */
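                      /*
                       * The fast path expects the RFC 1323 appendix A
                       * encoding: NOP, NOP, TIMESTAMP, length 10, TSval,
                       * TSecr (12 bytes in total), whose first 32 bits read
                       * back as TCPOPT_TSTAMP_HDR.
                       */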
                      if ((optlen == TCPOLEN_TSTAMP_APPA ||
                           (optlen > TCPOLEN_TSTAMP_APPA &&
                            optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
                          be32dec(optp) == TCPOPT_TSTAMP_HDR &&
                          (th->th_flags & TH_SYN) == 0) {
                              opti.ts_present = 1;
                              opti.ts_val = be32dec(optp + 4);
                              opti.ts_ecr = be32dec(optp + 8);
                              optp = NULL;        /* we've parsed the options */
                      }
              }
              tiflags = th->th_flags;
      
              /*
               * Checksum extended TCP header and data
               */
              if (tcp_input_checksum(af, m, th, off, thlen, tlen))
                      goto badcsum;
      
              /*
               * Locate pcb for segment.
               */
      findpcb:
              inp = NULL;
      #ifdef INET6
              in6p = NULL;
      #endif
              switch (af) {
              case AF_INET:
                      inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport,
                          ip->ip_dst, th->th_dport, &vestige);
                      if (inp == NULL && !vestige.valid) {
                              TCP_STATINC(TCP_STAT_PCBHASHMISS);
                              inp = in_pcblookup_bind(&tcbtable, ip->ip_dst,
                                  th->th_dport);
                      }
      #ifdef INET6
                      if (inp == NULL && !vestige.valid) {
                              struct in6_addr s, d;
      
                              /* mapped addr case */
                              in6_in_2_v4mapin6(&ip->ip_src, &s);
                              in6_in_2_v4mapin6(&ip->ip_dst, &d);
                              in6p = in6_pcblookup_connect(&tcbtable, &s,
                                  th->th_sport, &d, th->th_dport, 0, &vestige);
                              if (in6p == NULL && !vestige.valid) {
                                      TCP_STATINC(TCP_STAT_PCBHASHMISS);
                                      in6p = in6_pcblookup_bind(&tcbtable, &d,
                                          th->th_dport, 0);
                              }
                      }
      #endif
      #ifndef INET6
                      if (inp == NULL && !vestige.valid)
      #else
                      if (inp == NULL && in6p == NULL && !vestige.valid)
      #endif
                      {
                              TCP_STATINC(TCP_STAT_NOPORT);
                              if (tcp_log_refused &&
                                  (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
                                      tcp4_log_refused(ip, th);
                              }
                              tcp_fields_to_host(th);
                              goto dropwithreset_ratelim;
                      }
      #if defined(IPSEC)
                      if (ipsec_used) {
                              if (inp && ipsec_in_reject(m, inp)) {
                                      goto drop;
                              }
      #ifdef INET6
                              else if (in6p && ipsec_in_reject(m, in6p)) {
                                      goto drop;
                              }
      #endif
                      }
      #endif /*IPSEC*/
                      break;
      #ifdef INET6
              case AF_INET6:
                  {
                      int faith;
      
      #if defined(NFAITH) && NFAITH > 0
                      faith = faithprefix(&ip6->ip6_dst);
      #else
                      faith = 0;
      #endif
                      in6p = in6_pcblookup_connect(&tcbtable, &ip6->ip6_src,
                          th->th_sport, &ip6->ip6_dst, th->th_dport, faith, &vestige);
                      if (!in6p && !vestige.valid) {
                              TCP_STATINC(TCP_STAT_PCBHASHMISS);
                              in6p = in6_pcblookup_bind(&tcbtable, &ip6->ip6_dst,
                                  th->th_dport, faith);
                      }
                      if (!in6p && !vestige.valid) {
                              TCP_STATINC(TCP_STAT_NOPORT);
                              if (tcp_log_refused &&
                                  (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
                                      tcp6_log_refused(ip6, th);
                              }
                              tcp_fields_to_host(th);
                              goto dropwithreset_ratelim;
                      }
      #if defined(IPSEC)
                      if (ipsec_used && in6p && ipsec_in_reject(m, in6p)) {
                              goto drop;
                      }
      #endif
                      break;
                  }
      #endif
              }
      
              tcp_fields_to_host(th);
      
              /*
               * If the state is CLOSED (i.e., TCB does not exist) then
               * all data in the incoming segment is discarded.
               * If the TCB exists but is in CLOSED state, it is embryonic,
               * but should either do a listen or a connect soon.
               */
              tp = NULL;
              so = NULL;
              if (inp) {
                      /* Check the minimum TTL for socket. */
                      if (ip->ip_ttl < inp->inp_ip_minttl)
                              goto drop;
      
                      tp = intotcpcb(inp);
                      so = inp->inp_socket;
              }
      #ifdef INET6
              else if (in6p) {
                      tp = in6totcpcb(in6p);
                      so = in6p->in6p_socket;
              }
      #endif
              else if (vestige.valid) {
                      /* We do not support the resurrection of vtw tcpcbs. */
                      tcp_vtw_input(th, &vestige, m, tlen);
                      m = NULL;
                      goto drop;
              }
      
              if (tp == NULL)
                      goto dropwithreset_ratelim;
              if (tp->t_state == TCPS_CLOSED)
                      goto drop;
      
              KASSERT(so->so_lock == softnet_lock);
              KASSERT(solocked(so));
      
              /* Unscale the window into a 32-bit value. */
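              /*
               * Window scaling never applies to segments carrying SYN
               * (RFC 1323), hence the raw 16-bit value in that case.  As an
               * illustrative example, a received th_win of 65535 with
               * snd_scale 7 yields an effective window of roughly 8MB.
               */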
              if ((tiflags & TH_SYN) == 0)
                      tiwin = th->th_win << tp->snd_scale;
              else
                      tiwin = th->th_win;
      
      #ifdef INET6
              /* Save packet options if the user requested them. */
              if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) {
                      if (in6p->in6p_options) {
                              m_freem(in6p->in6p_options);
                              in6p->in6p_options = NULL;
                      }
                      KASSERT(ip6 != NULL);
                      ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m);
              }
      #endif
      
              if (so->so_options & SO_DEBUG) {
      #ifdef TCP_DEBUG
                      ostate = tp->t_state;
      #endif
      
                      tcp_saveti = NULL;
                      if (iphlen + sizeof(struct tcphdr) > MHLEN)
                              goto nosave;
      
                      if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) {
                              tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT);
                              if (tcp_saveti == NULL)
                                      goto nosave;
                      } else {
                              MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER);
                              if (tcp_saveti == NULL)
                                      goto nosave;
                              MCLAIM(m, &tcp_mowner);
                              tcp_saveti->m_len = iphlen;
                              m_copydata(m, 0, iphlen,
                                  mtod(tcp_saveti, void *));
                      }
      
                      if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) {
                              m_freem(tcp_saveti);
                              tcp_saveti = NULL;
                      } else {
                              tcp_saveti->m_len += sizeof(struct tcphdr);
                              memcpy(mtod(tcp_saveti, char *) + iphlen, th,
                                  sizeof(struct tcphdr));
                      }
      nosave:;
              }
      
              if (so->so_options & SO_ACCEPTCONN) {
                      union syn_cache_sa src;
                      union syn_cache_sa dst;
      
                      KASSERT(tp->t_state == TCPS_LISTEN);
      
                      memset(&src, 0, sizeof(src));
                      memset(&dst, 0, sizeof(dst));
                      switch (af) {
                      case AF_INET:
                              src.sin.sin_len = sizeof(struct sockaddr_in);
                              src.sin.sin_family = AF_INET;
                              src.sin.sin_addr = ip->ip_src;
                              src.sin.sin_port = th->th_sport;
      
                              dst.sin.sin_len = sizeof(struct sockaddr_in);
                              dst.sin.sin_family = AF_INET;
                              dst.sin.sin_addr = ip->ip_dst;
                              dst.sin.sin_port = th->th_dport;
                              break;
      #ifdef INET6
                      case AF_INET6:
                              src.sin6.sin6_len = sizeof(struct sockaddr_in6);
                              src.sin6.sin6_family = AF_INET6;
                              src.sin6.sin6_addr = ip6->ip6_src;
                              src.sin6.sin6_port = th->th_sport;
      
                              dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
                              dst.sin6.sin6_family = AF_INET6;
                              dst.sin6.sin6_addr = ip6->ip6_dst;
                              dst.sin6.sin6_port = th->th_dport;
                              break;
      #endif
                      }
      
                      if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
                              if (tiflags & TH_RST) {
                                      syn_cache_reset(&src.sa, &dst.sa, th);
                              } else if ((tiflags & (TH_ACK|TH_SYN)) ==
                                  (TH_ACK|TH_SYN)) {
                                      /*
                                       * Received a SYN,ACK. This should never
                                       * happen while we are in LISTEN. Send an RST.
                                       */
                                      goto badsyn;
                              } else if (tiflags & TH_ACK) {
                                      so = syn_cache_get(&src.sa, &dst.sa, th, so, m);
                                      if (so == NULL) {
                                              /*
                                               * We don't have a SYN for this ACK;
                                               * send an RST.
                                               */
                                              goto badsyn;
                                      } else if (so == (struct socket *)(-1)) {
                                              /*
                                               * We were unable to create the
                                               * connection.  The 3-way
                                               * handshake was completed, and
                                               * an RST has been sent to the
                                               * peer.  Since the mbuf might be
                                               * in use for the reply, do not
                                               * free it.
                                               */
                                              m = NULL;
                                      } else {
                                              /*
                                               * We have created a full-blown
                                               * connection.
                                               */
                                              tp = NULL;
                                              inp = NULL;
      #ifdef INET6
                                              in6p = NULL;
      #endif
                                              switch (so->so_proto->pr_domain->dom_family) {
                                              case AF_INET:
                                                      inp = sotoinpcb(so);
                                                      tp = intotcpcb(inp);
                                                      break;
      #ifdef INET6
                                              case AF_INET6:
                                                      in6p = sotoin6pcb(so);
                                                      tp = in6totcpcb(in6p);
                                                      break;
      #endif
                                              }
                                              if (tp == NULL)
                                                      goto badsyn;        /*XXX*/
                                              tiwin <<= tp->snd_scale;
                                              goto after_listen;
                                      }
                              } else {
                                      /*
                                       * None of RST, SYN or ACK was set.
                                       * This is an invalid packet for a
                                       * TCB in LISTEN state.  Send a RST.
                                       */
                                      goto badsyn;
                              }
                      } else {
                              /*
                               * Received a SYN.
                               */
      
      #ifdef INET6
                              /*
                              * If deprecated addresses are forbidden, we do
                              * not accept a SYN to a deprecated interface
                              * address, to prevent any new inbound
                              * connection from getting established.
                              * When we do not accept the SYN, we send a TCP
                              * RST with the deprecated source address
                              * (instead of simply dropping the segment).
                              * This is a compromise: it is much better for
                              * the peer to receive an RST, and the RST will
                              * be the final packet of the exchange.
                               *
                               * If we do not forbid deprecated addresses, we
                               * accept the SYN packet.  RFC2462 does not
                               * suggest dropping SYN in this case.
                              * Our reading of RFC2462 5.5.4 is as
                              * follows:
                               * 1. use of deprecated addr with existing
                               *    communication is okay - "SHOULD continue
                               *    to be used"
                               * 2. use of it with new communication:
                               *   (2a) "SHOULD NOT be used if alternate
                               *        address with sufficient scope is
                               *        available"
                               *   (2b) nothing mentioned otherwise.
                              * Here we fall into case (2b), as we have no
                              * choice in our source address selection; we
                              * must obey the peer.
                               *
                              * The wording in RFC2462 is confusing: it
                              * describes deprecated address handling in
                              * several places, and the descriptions are not
                              * exactly the same.  We believe 5.5.4 is the
                              * clearest, so we follow 5.5.4.
                               */
                              if (af == AF_INET6 && !ip6_use_deprecated) {
                                      struct in6_ifaddr *ia6;
                                      int s;
                                      struct ifnet *rcvif = m_get_rcvif(m, &s);
                                      if (rcvif == NULL)
                                              goto dropwithreset; /* XXX */
                                      if ((ia6 = in6ifa_ifpwithaddr(rcvif,
                                          &ip6->ip6_dst)) &&
                                          (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
                                              tp = NULL;
                                              m_put_rcvif(rcvif, &s);
                                              goto dropwithreset;
                                      }
                                      m_put_rcvif(rcvif, &s);
                              }
      #endif
      
                              /*
                               * LISTEN socket received a SYN from itself? This
                               * can't possibly be valid; drop the packet.
                               */
                              if (th->th_sport == th->th_dport) {
                                      int eq = 0;
      
                                      switch (af) {
                                      case AF_INET:
                                              eq = in_hosteq(ip->ip_src, ip->ip_dst);
                                              break;
      #ifdef INET6
                                      case AF_INET6:
                                              eq = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
                                                  &ip6->ip6_dst);
                                              break;
      #endif
                                      }
                                      if (eq) {
                                              TCP_STATINC(TCP_STAT_BADSYN);
                                              goto drop;
                                      }
                              }
      
                              /*
                               * SYN looks ok; create compressed TCP
                               * state for it.
                               */
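                               /*
                                * Note on the call below (inferred from the
                                * surrounding code, not authoritative): on
                                * success, syn_cache_add() consumes the mbuf,
                                * hence m is set to NULL so that the final
                                * "goto drop" does not free it a second time.
                                * If the listen queue is over its limit
                                * (so_qlen > so_qlimit), the SYN is dropped
                                * without creating any state.
                                */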
                              if (so->so_qlen <= so->so_qlimit &&
                                  syn_cache_add(&src.sa, &dst.sa, th, off,
                                  so, m, optp, optlen, &opti))
                                      m = NULL;
                      }
      
                      goto drop;
              }
      
      after_listen:
              /*
               * From here on, we're dealing with !LISTEN.
               */
              KASSERT(tp->t_state != TCPS_LISTEN);
      
              /*
               * Segment received on connection.
               * Reset idle time and keep-alive timer.
               */
              tp->t_rcvtime = tcp_now;
              if (TCPS_HAVEESTABLISHED(tp->t_state))
                      TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
      
              /*
               * Process options.
               */
      #ifdef TCP_SIGNATURE
              if (optp || (tp->t_flags & TF_SIGNATURE))
      #else
              if (optp)
      #endif
                      if (tcp_dooptions(tp, optp, optlen, th, m, off, &opti) < 0)
                              goto drop;
      
              if (TCP_SACK_ENABLED(tp)) {
                      tcp_del_sackholes(tp, th);
              }
      
              if (TCP_ECN_ALLOWED(tp)) {
                      if (tiflags & TH_CWR) {
                              tp->t_flags &= ~TF_ECN_SND_ECE;
                      }
                      switch (iptos & IPTOS_ECN_MASK) {
                      case IPTOS_ECN_CE:
                              tp->t_flags |= TF_ECN_SND_ECE;
                              TCP_STATINC(TCP_STAT_ECN_CE);
                              break;
                      case IPTOS_ECN_ECT0:
                              TCP_STATINC(TCP_STAT_ECN_ECT);
                              break;
                      case IPTOS_ECN_ECT1:
                              /* XXX: ignore for now -- rpaulo */
                              break;
                      }
                      /*
                       * Congestion experienced.
                       * Ignore if we are already trying to recover.
                       */
                      if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover))
                              tp->t_congctl->cong_exp(tp);
              }
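               /*
                * Example of the ECN handling above (hypothetical packet):
                * a segment arriving with IPTOS_ECN_CE in the IP header means
                * a router marked congestion, so TF_ECN_SND_ECE is set and
                * our ACKs will carry ECE until the peer answers with CWR;
                * an ECE in the TCP flags invokes the congestion control
                * module's cong_exp() hook, unless we are already recovering
                * (snd_una < snd_recover).
                */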
      
              if (opti.ts_present && opti.ts_ecr) {
                      /*
                       * Calculate the RTT from the returned time stamp and the
                       * connection's time base.  If the time stamp is later than
                       * the current time, or is extremely old, fall back to non-1323
                       * RTT calculation.  Since ts_rtt is unsigned, we can test both
                       * at the same time.
                       *
                       * Note that ts_rtt is in units of slow ticks (500
                       * ms).  Since most earthbound RTTs are < 500 ms,
                       * observed values will have large quantization noise.
                       * Our smoothed RTT is then the fraction of observed
                       * samples that are 1 tick instead of 0 (times 500
                       * ms).
                       *
                       * ts_rtt is increased by 1 to denote a valid sample,
                       * with 0 indicating an invalid measurement.  This
                       * extra 1 must be removed when ts_rtt is used, or
                       * else an erroneous extra 500 ms will result.
                       */
                      ts_rtt = TCP_TIMESTAMP(tp) - opti.ts_ecr + 1;
                      if (ts_rtt > TCP_PAWS_IDLE)
                              ts_rtt = 0;
              } else {
                      ts_rtt = 0;
              }
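               /*
                * Worked example of the arithmetic above (hypothetical
                * values): with PR_SLOWHZ = 2, one tick is 500 ms.  If
                * TCP_TIMESTAMP(tp) is 1000 and opti.ts_ecr is 998, then
                * ts_rtt = 3, i.e. a valid sample of 2 ticks (~1000 ms) once
                * the +1 validity marker is subtracted again when the value
                * is fed to tcp_xmit_timer().  A ts_ecr from the "future", or
                * one older than TCP_PAWS_IDLE, makes the unsigned difference
                * huge, so ts_rtt is forced back to 0 (no valid sample).
                */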
      
              /*
               * Fast path: check for the two common cases of a uni-directional
               * data transfer. If:
               *    o We are in the ESTABLISHED state, and
               *    o The packet has no control flags, and
               *    o The packet is in-sequence, and
               *    o The window didn't change, and
               *    o We are not retransmitting
               * It's a candidate.
               *
               * If the length (tlen) is zero and the ack moved forward, we're
               * the sender side of the transfer. Just free the data acked and
               * wake any higher level process that was blocked waiting for
               * space.
               *
               * If the length is non-zero and the ack didn't move, we're the
               * receiver side. If we're getting packets in-order (the reassembly
               * queue is empty), add the data to the socket buffer and note
               * that we need a delayed ack.
               */
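               /*
                * Concrete (hypothetical) examples: a pure ACK that advances
                * snd_una on a bulk upload takes the sender-side branch below
                * and merely drops acked data from the send buffer; a
                * 1448-byte in-sequence segment on a bulk download, carrying
                * only TH_ACK with an unchanged window and an empty
                * reassembly queue, takes the receiver-side branch and is
                * appended straight to the socket buffer.
                */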
              if (tp->t_state == TCPS_ESTABLISHED &&
                  (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK))
                      == TH_ACK &&
                  (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
                  th->th_seq == tp->rcv_nxt &&
                  tiwin && tiwin == tp->snd_wnd &&
                  tp->snd_nxt == tp->snd_max) {
      
                      /*
                       * If last ACK falls within this segment's sequence numbers,
                       * record the timestamp.
                       * NOTE that the test is modified according to the latest
                       * proposal of the tcplw@cray.com list (Braden 1993/04/26).
                       *
                        * Note that we already know that
                        *        TSTMP_GEQ(opti.ts_val, tp->ts_recent)
                        * holds, from the enclosing fast-path test.
                       */
                      if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
                              tp->ts_recent_age = tcp_now;
                              tp->ts_recent = opti.ts_val;
                      }
      
                      if (tlen == 0) {
                              /* Ack prediction. */
                              if (SEQ_GT(th->th_ack, tp->snd_una) &&
                                  SEQ_LEQ(th->th_ack, tp->snd_max) &&
                                  tp->snd_cwnd >= tp->snd_wnd &&
                                  tp->t_partialacks < 0) {
                                      /*
                                       * this is a pure ack for outstanding data.
                                       */
                                      if (ts_rtt)
                                              tcp_xmit_timer(tp, ts_rtt - 1);
                                      else if (tp->t_rtttime &&
                                          SEQ_GT(th->th_ack, tp->t_rtseq))
                                              tcp_xmit_timer(tp,
                                                tcp_now - tp->t_rtttime);
                                      acked = th->th_ack - tp->snd_una;
                                      tcps = TCP_STAT_GETREF();
                                      tcps[TCP_STAT_PREDACK]++;
                                      tcps[TCP_STAT_RCVACKPACK]++;
                                      tcps[TCP_STAT_RCVACKBYTE] += acked;
                                      TCP_STAT_PUTREF();
                                      nd6_hint(tp);
      
                                      if (acked > (tp->t_lastoff - tp->t_inoff))
                                              tp->t_lastm = NULL;
                                      sbdrop(&so->so_snd, acked);
                                      tp->t_lastoff -= acked;
      
                                      icmp_check(tp, th, acked);
      
                                      tp->snd_una = th->th_ack;
                                      tp->snd_fack = tp->snd_una;
                                      if (SEQ_LT(tp->snd_high, tp->snd_una))
                                              tp->snd_high = tp->snd_una;
                                      m_freem(m);
      
                                      /*
                                       * If all outstanding data are acked, stop
                                       * retransmit timer, otherwise restart timer
                                       * using current (possibly backed-off) value.
                                       * If process is waiting for space,
                                       * wakeup/selnotify/signal.  If data
                                       * are ready to send, let tcp_output
                                       * decide between more output or persist.
                                       */
                                      if (tp->snd_una == tp->snd_max)
                                              TCP_TIMER_DISARM(tp, TCPT_REXMT);
                                      else if (TCP_TIMER_ISARMED(tp,
                                          TCPT_PERSIST) == 0)
                                              TCP_TIMER_ARM(tp, TCPT_REXMT,
                                                  tp->t_rxtcur);
      
                                      sowwakeup(so);
                                      if (so->so_snd.sb_cc) {
                                              KERNEL_LOCK(1, NULL);
                                              (void)tcp_output(tp);
                                              KERNEL_UNLOCK_ONE(NULL);
                                      }
                                      if (tcp_saveti)
                                              m_freem(tcp_saveti);
                                      return;
                              }
                      } else if (th->th_ack == tp->snd_una &&
                          TAILQ_FIRST(&tp->segq) == NULL &&
                          tlen <= sbspace(&so->so_rcv)) {
                              int newsize = 0;
      
                              /*
                               * this is a pure, in-sequence data packet
                               * with nothing on the reassembly queue and
                               * we have enough buffer space to take it.
                               */
                              tp->rcv_nxt += tlen;
                              tcps = TCP_STAT_GETREF();
                              tcps[TCP_STAT_PREDDAT]++;
                              tcps[TCP_STAT_RCVPACK]++;
                              tcps[TCP_STAT_RCVBYTE] += tlen;
                              TCP_STAT_PUTREF();
                              nd6_hint(tp);
      
                               /*
                                * Automatic sizing gives the performance of
                                * large buffers and most of the memory
                                * efficiency of small ones, by allocating
                                * space only when it is needed.
                                *
                                * On the receive side the socket buffer memory
                                * is only rarely used to any significant
                                * extent.  This allows us to be much more
                                * aggressive in scaling the receive socket
                                * buffer.  If the buffer space actually is
                                * used to a large extent and we run out of
                                * kernel memory, we can simply drop the new
                                * segments; the sender's TCP will just
                                * retransmit them later.  An over-large buffer
                                * size consumes excessive kernel memory only
                                * if the application doesn't read() from the
                                * socket, or if packet loss or reordering
                                * fills the reassembly queue.
                                *
                                * The criteria to step up the receive buffer
                                * one notch are:
                                *  1. bytes are counted over the time it takes
                                *     one timestamp to be reflected back to us
                                *     (one RTT);
                                *  2. the bytes received per RTT exceed seven
                                *     eighths of the current socket buffer
                                *     size;
                                *  3. the receive buffer size has not hit the
                                *     maximal automatic size.
                                *
                                * The algorithm does at most one step per RTT,
                                * and only while we receive a bulk stream
                                * without packet loss or reordering.
                                * Shrinking the buffer during idle times is
                                * not necessary, as it doesn't consume any
                                * memory when idle.
                                *
                                * TODO: Only step up if the application is
                                * actually draining the buffer, to better
                                * manage socket buffer resources.
                                */
                              if (tcp_do_autorcvbuf &&
                                  opti.ts_ecr &&
                                  (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
                                      if (opti.ts_ecr > tp->rfbuf_ts &&
                                          opti.ts_ecr - tp->rfbuf_ts < PR_SLOWHZ) {
                                              if (tp->rfbuf_cnt >
                                                  (so->so_rcv.sb_hiwat / 8 * 7) &&
                                                  so->so_rcv.sb_hiwat <
                                                  tcp_autorcvbuf_max) {
                                                      newsize =
                                                          uimin(so->so_rcv.sb_hiwat +
                                                          tcp_autorcvbuf_inc,
                                                          tcp_autorcvbuf_max);
                                              }
                                              /* Start over with next RTT. */
                                              tp->rfbuf_ts = 0;
                                              tp->rfbuf_cnt = 0;
                                      } else
                                              tp->rfbuf_cnt += tlen;        /* add up */
                              }
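                               /*
                                * Illustrative numbers (defaults vary; values
                                * purely hypothetical): with sb_hiwat = 65536,
                                * tcp_autorcvbuf_inc = 16384 and
                                * tcp_autorcvbuf_max = 262144, the buffer is
                                * grown to 81920 only if more than 57344 bytes
                                * (7/8 of 65536) arrived within one timestamp
                                * RTT, i.e. before the timestamp stamped at the
                                * start of the measurement was echoed back.
                                */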
      
                               /*
                                * Drop the TCP and IP headers and the TCP
                                * options, then add the data to the socket
                                * buffer.
                                */
                              if (so->so_state & SS_CANTRCVMORE) {
                                      m_freem(m);
                              } else {
                                      /*
                                       * Set new socket buffer size.
                                       * Give up when limit is reached.
                                       */
                                      if (newsize)
                                              if (!sbreserve(&so->so_rcv,
                                                  newsize, so))
                                                      so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
                                      m_adj(m, off + thlen);
                                      sbappendstream(&so->so_rcv, m);
                              }
                              sorwakeup(so);
                              tcp_setup_ack(tp, th);
                              if (tp->t_flags & TF_ACKNOW) {
                                      KERNEL_LOCK(1, NULL);
                                      (void)tcp_output(tp);
                                      KERNEL_UNLOCK_ONE(NULL);
                              }
                              if (tcp_saveti)
                                      m_freem(tcp_saveti);
                              return;
                      }
              }
      
              /*
               * Compute mbuf offset to TCP data segment.
               */
              hdroptlen = off + thlen;
      
               /*
                * Calculate the amount of space in the receive window.  The
                * receive window is the amount of space in the receive queue,
                * but never less than the amount we have already advertised
                * (so the offered window does not shrink).
                */
              {
                      int win;
                      win = sbspace(&so->so_rcv);
                      if (win < 0)
                              win = 0;
                      tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
              }
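               /*
                * Example (hypothetical values): if sbspace() currently
                * reports only 2048 bytes free, but we have already
                * advertised space up to rcv_adv = rcv_nxt + 8192, rcv_wnd is
                * kept at 8192 so the window offered to the peer never
                * shrinks below what it was already told.
                */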
      
              /* Reset receive buffer auto scaling when not in bulk receive mode. */
              tp->rfbuf_ts = 0;
              tp->rfbuf_cnt = 0;
      
              switch (tp->t_state) {
              /*
               * If the state is SYN_SENT:
               *        if seg contains an ACK, but not for our SYN, drop the input.
               *        if seg contains a RST, then drop the connection.
               *        if seg does not contain SYN, then drop it.
               * Otherwise this is an acceptable SYN segment
               *        initialize tp->rcv_nxt and tp->irs
               *        if seg contains ack then advance tp->snd_una
               *        if seg contains a ECE and ECN support is enabled, the stream
               *            is ECN capable.
               *        if SYN has been acked change to ESTABLISHED else SYN_RCVD state
               *        arrange for segment to be acked (eventually)
               *        continue processing rest of data/controls, beginning with URG
               */
              case TCPS_SYN_SENT:
                      if ((tiflags & TH_ACK) &&
                          (SEQ_LEQ(th->th_ack, tp->iss) ||
                           SEQ_GT(th->th_ack, tp->snd_max)))
                              goto dropwithreset;
                      if (tiflags & TH_RST) {
                              if (tiflags & TH_ACK)
                                      tp = tcp_drop(tp, ECONNREFUSED);
                              goto drop;
                      }
                      if ((tiflags & TH_SYN) == 0)
                              goto drop;
                      if (tiflags & TH_ACK) {
                              tp->snd_una = th->th_ack;
                              if (SEQ_LT(tp->snd_nxt, tp->snd_una))
                                      tp->snd_nxt = tp->snd_una;
                              if (SEQ_LT(tp->snd_high, tp->snd_una))
                                      tp->snd_high = tp->snd_una;
                              TCP_TIMER_DISARM(tp, TCPT_REXMT);
      
                              if ((tiflags & TH_ECE) && tcp_do_ecn) {
                                      tp->t_flags |= TF_ECN_PERMIT;
                                      TCP_STATINC(TCP_STAT_ECN_SHS);
                              }
                      }
                      tp->irs = th->th_seq;
                      tcp_rcvseqinit(tp);
                      tp->t_flags |= TF_ACKNOW;
                      tcp_mss_from_peer(tp, opti.maxseg);
      
                      /*
                       * Initialize the initial congestion window.  If we
                       * had to retransmit the SYN, we must initialize cwnd
                       * to 1 segment (i.e. the Loss Window).
                       */
                      if (tp->t_flags & TF_SYN_REXMT)
                              tp->snd_cwnd = tp->t_peermss;
                      else {
                              int ss = tcp_init_win;
                              if (inp != NULL && in_localaddr(inp->inp_faddr))
                                      ss = tcp_init_win_local;
      #ifdef INET6
                              if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
                                      ss = tcp_init_win_local;
      #endif
                              tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
                      }
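                       /*
                        * Rough example (exact clamping is up to
                        * TCP_INITIAL_WINDOW(); values hypothetical): with a
                        * peer MSS of 1460, a retransmitted SYN starts cwnd at
                        * a single segment (1460 bytes, the Loss Window),
                        * while a clean handshake starts at tcp_init_win (or
                        * tcp_init_win_local for an on-link peer) segments'
                        * worth of data.
                        */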
      
                      tcp_rmx_rtt(tp);
                      if (tiflags & TH_ACK) {
                              TCP_STATINC(TCP_STAT_CONNECTS);
                               /*
                                * Call tcp_established() before
                                * soisconnected(), because the socket upcall
                                * handler may immediately drive tcp_output().
                                * XXX we might call soisconnected at the end
                                * of all processing
                                */
                              tcp_established(tp);
                              soisconnected(so);
                              /* Do window scaling on this connection? */
                              if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
                                  (TF_RCVD_SCALE|TF_REQ_SCALE)) {
                                      tp->snd_scale = tp->requested_s_scale;
                                      tp->rcv_scale = tp->request_r_scale;
                              }
                              TCP_REASS_LOCK(tp);
                              (void)tcp_reass(tp, NULL, NULL, tlen);
                              /*
                               * if we didn't have to retransmit the SYN,
                               * use its rtt as our initial srtt & rtt var.
                               */
                              if (tp->t_rtttime)
                                      tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
                      } else {
                              tp->t_state = TCPS_SYN_RECEIVED;
                      }
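                       /*
                        * The else branch above is the simultaneous-open
                        * case: we received a SYN without an ACK while in
                        * SYN_SENT, meaning both ends sent SYNs at once, so
                        * we move to SYN_RECEIVED and will ACK the peer's
                        * SYN below.
                        */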
      
                      /*
                       * Advance th->th_seq to correspond to first data byte.
                       * If data, trim to stay within window,
                       * dropping FIN if necessary.
                       */
                      th->th_seq++;
                      if (tlen > tp->rcv_wnd) {
                              todrop = tlen - tp->rcv_wnd;
                              m_adj(m, -todrop);
                              tlen = tp->rcv_wnd;
                              tiflags &= ~TH_FIN;
                              tcps = TCP_STAT_GETREF();
                              tcps[TCP_STAT_RCVPACKAFTERWIN]++;
                              tcps[TCP_STAT_RCVBYTEAFTERWIN] += todrop;
                              TCP_STAT_PUTREF();
                      }
                      tp->snd_wl1 = th->th_seq - 1;
                      tp->rcv_up = th->th_seq;
                      goto step6;
      
              /*
               * If the state is SYN_RECEIVED:
               *        If seg contains an ACK, but not for our SYN, drop the input
               *        and generate an RST.  See page 36, rfc793
               */
              case TCPS_SYN_RECEIVED:
                      if ((tiflags & TH_ACK) &&
                          (SEQ_LEQ(th->th_ack, tp->iss) ||
                           SEQ_GT(th->th_ack, tp->snd_max)))
                              goto dropwithreset;
                      break;
              }
      
              /*
               * From here on, we're dealing with !LISTEN and !SYN_SENT.
               */
              KASSERT(tp->t_state != TCPS_LISTEN &&
                  tp->t_state != TCPS_SYN_SENT);
      
              /*
               * RFC1323 PAWS: if we have a timestamp reply on this segment and
               * it's less than ts_recent, drop it.
               */
              if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
                  TSTMP_LT(opti.ts_val, tp->ts_recent)) {
                      /* Check to see if ts_recent is over 24 days old.  */
                      if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) {
                              /*
                               * Invalidate ts_recent.  If this segment updates
                               * ts_recent, the age will be reset later and ts_recent
                               * will get a valid value.  If it does not, setting
                               * ts_recent to zero will at least satisfy the
                               * requirement that zero be placed in the timestamp
                               * echo reply when ts_recent isn't valid.  The
                               * age isn't reset until we get a valid ts_recent
                               * because we don't want out-of-order segments to be
                               * dropped when ts_recent is old.
                               */
                              tp->ts_recent = 0;
                      } else {
                              tcps = TCP_STAT_GETREF();
                              tcps[TCP_STAT_RCVDUPPACK]++;
                              tcps[TCP_STAT_RCVDUPBYTE] += tlen;
                              tcps[TCP_STAT_PAWSDROP]++;
                              TCP_STAT_PUTREF();
                              tcp_new_dsack(tp, th->th_seq, tlen);
                              goto dropafterack;
                      }
              }
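               /*
                * Scale of the check above (assuming PR_SLOWHZ = 2):
                * TCP_PAWS_IDLE corresponds to 24 days of tcp_now ticks,
                * roughly 4.1 million.  An old ts_val is normally answered
                * with a duplicate ACK and dropped, but if ts_recent itself
                * has not been refreshed for more than that interval it is
                * assumed stale (e.g. the peer's timestamp clock wrapped or
                * was reset) and is invalidated instead.
                */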
      
              /*
               * Check that at least some bytes of the segment are within the
               * receive window. If segment begins before rcv_nxt, drop leading
               * data (and SYN); if nothing left, just ack.
               */
              todrop = tp->rcv_nxt - th->th_seq;
              dupseg = false;
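               /*
                * Worked example (hypothetical values): with rcv_nxt = 1000,
                * th_seq = 900 and tlen = 200, todrop = 100, so the 100
                * already-received bytes are trimmed and only the new bytes
                * 1000-1099 are processed further.
                */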
              if (todrop > 0) {
                      if (tiflags & TH_SYN) {
                              tiflags &= ~TH_SYN;
                              th->th_seq++;
                              tcp_urp_drop(th, 1, &tiflags);
                              todrop--;
                      }
                      if (todrop > tlen ||
                          (todrop == tlen && (tiflags & TH_FIN) == 0)) {
                              /*
                               * Any valid FIN or RST must be to the left of the
                               * window.  At this point the FIN or RST must be a
                               * duplicate or out of sequence; drop it.
                               */
                              if (tiflags & TH_RST)
                                      goto drop;
                              tiflags &= ~(TH_FIN|TH_RST);
      
                              /*
                               * Send an ACK to resynchronize and drop any data.
                               * But keep on processing for RST or ACK.
                               */
                              tp->t_flags |= TF_ACKNOW;
                              todrop = tlen;
                              dupseg = true;
                              tcps = TCP_STAT_GETREF();
                              tcps[TCP_STAT_RCVDUPPACK]++;
                              tcps[TCP_STAT_RCVDUPBYTE] += todrop;
                              TCP_STAT_PUTREF();
                      } else if ((tiflags & TH_RST) && th->th_seq != tp->rcv_nxt) {
                              /*
                               * Test for reset before adjusting the sequence
                               * number for overlapping data.
                               */
                              goto dropafterack_ratelim;
                      } else {
                              tcps =