/*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice unmodified, this list of conditions, and the following
       *    disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
       * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
       * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
       * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
       * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
       * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
       * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * This file implements the ULE scheduler.  ULE supports independent CPU
       * run queues and fine grain locking.  It has superior interactive
       * performance under load even on uni-processor systems.
       *
       * etymology:
       *   ULE is the last three letters in schedule.  It owes its name to a
       * generic user created for a scheduling system by Paul Mikesell at
       * Isilon Systems and a general lack of creativity on the part of the author.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_hwpmc_hooks.h"
      #include "opt_sched.h"
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/kdb.h>
      #include <sys/kernel.h>
      #include <sys/ktr.h>
      #include <sys/limits.h>
      #include <sys/lock.h>
      #include <sys/mutex.h>
      #include <sys/proc.h>
      #include <sys/resource.h>
      #include <sys/resourcevar.h>
      #include <sys/sched.h>
      #include <sys/sdt.h>
      #include <sys/smp.h>
      #include <sys/sx.h>
      #include <sys/sysctl.h>
      #include <sys/sysproto.h>
      #include <sys/turnstile.h>
      #include <sys/umtx.h>
      #include <sys/vmmeter.h>
      #include <sys/cpuset.h>
      #include <sys/sbuf.h>
      
      #ifdef HWPMC_HOOKS
      #include <sys/pmckern.h>
      #endif
      
      #ifdef KDTRACE_HOOKS
      #include <sys/dtrace_bsd.h>
      int __read_mostly                dtrace_vtime_active;
      dtrace_vtime_switch_func_t        dtrace_vtime_switch_func;
      #endif
      
      #include <machine/cpu.h>
      #include <machine/smp.h>
      
      #define        KTR_ULE        0
      
      #define        TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
      #define        TDQ_NAME_LEN        (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU)))
      #define        TDQ_LOADNAME_LEN        (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load"))
      
      /*
       * Thread scheduler specific section.  All fields are protected
       * by the thread lock.
       */
      struct td_sched {        
              struct runq        *ts_runq;        /* Run-queue we're queued on. */
              short                ts_flags;        /* TSF_* flags. */
              int                ts_cpu;                /* CPU that we have affinity for. */
              int                ts_rltick;        /* Real last tick, for affinity. */
              int                ts_slice;        /* Ticks of slice remaining. */
              u_int                ts_slptime;        /* Number of ticks we vol. slept */
              u_int                ts_runtime;        /* Number of ticks we were running */
              int                ts_ltick;        /* Last tick that we were running on */
              int                ts_ftick;        /* First tick that we were running on */
              int                ts_ticks;        /* Tick count */
      #ifdef KTR
              char                ts_name[TS_NAME_LEN];
      #endif
      };
      /* flags kept in ts_flags */
      #define        TSF_BOUND        0x0001                /* Thread can not migrate. */
      #define        TSF_XFERABLE        0x0002                /* Thread was added as transferable. */
      
      #define        THREAD_CAN_MIGRATE(td)        ((td)->td_pinned == 0)
      #define        THREAD_CAN_SCHED(td, cpu)        \
          CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
      
      _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
          sizeof(struct thread0_storage),
          "increase struct thread0_storage.t0st_sched size");
      
      /*
       * Priority ranges used for interactive and non-interactive timeshare
       * threads.  The timeshare priorities are split up into four ranges.
       * The first range handles interactive threads.  The last three ranges
       * (NHALF, x, and NHALF) handle non-interactive threads with the outer
       * ranges supporting nice values.
       */
      #define        PRI_TIMESHARE_RANGE        (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
      #define        PRI_INTERACT_RANGE        ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2)
      #define        PRI_BATCH_RANGE                (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE)
      
      #define        PRI_MIN_INTERACT        PRI_MIN_TIMESHARE
      #define        PRI_MAX_INTERACT        (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1)
      #define        PRI_MIN_BATCH                (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE)
      #define        PRI_MAX_BATCH                PRI_MAX_TIMESHARE
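
/*
 * Illustrative example (assuming the classic <sys/priority.h> values
 * PRI_MIN_TIMESHARE = 120, PRI_MAX_TIMESHARE = 223 and a nice range of
 * -20..20): PRI_TIMESHARE_RANGE = 104 and SCHED_PRI_NRESV = 40, so
 * PRI_INTERACT_RANGE = 32 and PRI_BATCH_RANGE = 72, giving interactive
 * priorities 120-151 and batch priorities 152-223.
 */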
      
      /*
       * Cpu percentage computation macros and defines.
       *
       * SCHED_TICK_SECS:        Number of seconds to average the cpu usage across.
       * SCHED_TICK_TARG:        Number of hz ticks to average the cpu usage across.
       * SCHED_TICK_MAX:        Maximum number of ticks before scaling back.
       * SCHED_TICK_SHIFT:        Shift factor to avoid rounding away results.
       * SCHED_TICK_HZ:        Compute the number of hz ticks for a given ticks count.
       * SCHED_TICK_TOTAL:        Gives the amount of time we've been recording ticks.
       */
      #define        SCHED_TICK_SECS                10
      #define        SCHED_TICK_TARG                (hz * SCHED_TICK_SECS)
      #define        SCHED_TICK_MAX                (SCHED_TICK_TARG + hz)
      #define        SCHED_TICK_SHIFT        10
      #define        SCHED_TICK_HZ(ts)        ((ts)->ts_ticks >> SCHED_TICK_SHIFT)
      #define        SCHED_TICK_TOTAL(ts)        (max((ts)->ts_ltick - (ts)->ts_ftick, hz))
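
/*
 * Worked example (illustrative, assuming hz = 1000): SCHED_TICK_TARG is
 * 10000 ticks and SCHED_TICK_MAX is 11000.  A thread whose ts_ticks
 * accumulates to 4000 << SCHED_TICK_SHIFT over a full 10 second window has
 * SCHED_TICK_HZ(ts) = 4000 and SCHED_TICK_TOTAL(ts) ~= 10000, i.e. roughly
 * 40% cpu over the averaging period.
 */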
      
      /*
       * These macros determine priorities for non-interactive threads.  They are
       * assigned a priority based on their recent cpu utilization as expressed
       * by the ratio of ticks to the tick total.  NHALF priorities at the start
       * and end of the MIN to MAX timeshare range are only reachable with negative
       * or positive nice respectively.
       *
       * PRI_RANGE:        Priority range for utilization dependent priorities.
       * PRI_NRESV:        Number of nice values.
       * PRI_TICKS:        Compute a priority in PRI_RANGE from the ticks count and total.
       * PRI_NICE:        Determines the part of the priority inherited from nice.
       */
      #define        SCHED_PRI_NRESV                (PRIO_MAX - PRIO_MIN)
      #define        SCHED_PRI_NHALF                (SCHED_PRI_NRESV / 2)
      #define        SCHED_PRI_MIN                (PRI_MIN_BATCH + SCHED_PRI_NHALF)
      #define        SCHED_PRI_MAX                (PRI_MAX_BATCH - SCHED_PRI_NHALF)
      #define        SCHED_PRI_RANGE                (SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
      #define        SCHED_PRI_TICKS(ts)                                                \
          (SCHED_TICK_HZ((ts)) /                                                \
          (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
      #define        SCHED_PRI_NICE(nice)        (nice)
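
/*
 * Continuing the example above (same assumed values): SCHED_PRI_NHALF = 20,
 * SCHED_PRI_MIN = 172, SCHED_PRI_MAX = 203 and SCHED_PRI_RANGE = 32.  For
 * the 40% cpu thread, SCHED_PRI_TICKS(ts) = 4000 / (roundup(10000, 32) / 32)
 * = 4000 / 313 = 12, so recent utilization adds about a third of the 32 step
 * batch range on top of SCHED_PRI_MIN before nice is applied.
 */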
      
      /*
       * These determine the interactivity of a process.  Interactivity differs from
       * cpu utilization in that it expresses the voluntary time slept vs time ran
       * while cpu utilization includes all time not running.  This more accurately
       * models the intent of the thread.
       *
       * SLP_RUN_MAX:        Maximum amount of sleep time + run time we'll accumulate
       *                before throttling back.
       * SLP_RUN_FORK:        Maximum slp+run time to inherit at fork time.
       * INTERACT_MAX:        Maximum interactivity value.  Smaller is better.
       * INTERACT_THRESH:        Threshold for placement on the current runq.
       */
      #define        SCHED_SLP_RUN_MAX        ((hz * 5) << SCHED_TICK_SHIFT)
      #define        SCHED_SLP_RUN_FORK        ((hz / 2) << SCHED_TICK_SHIFT)
      #define        SCHED_INTERACT_MAX        (100)
      #define        SCHED_INTERACT_HALF        (SCHED_INTERACT_MAX / 2)
      #define        SCHED_INTERACT_THRESH        (30)
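
/*
 * Illustrative numbers (assuming hz = 1000): the sleep + run history is
 * capped at 5 seconds worth of scaled ticks and a forked child inherits at
 * most half a second of it.  A thread that voluntarily sleeps roughly three
 * times as long as it runs scores well under SCHED_INTERACT_THRESH (30 on a
 * 100 point scale) and is treated as interactive, while a mostly-running
 * thread scores above SCHED_INTERACT_HALF and stays on the batch queues.
 */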
      
      /*
       * These parameters determine the slice behavior for batch work.
       */
      #define        SCHED_SLICE_DEFAULT_DIVISOR        10        /* ~94 ms, 12 stathz ticks. */
      #define        SCHED_SLICE_MIN_DIVISOR                6        /* DEFAULT/MIN = ~16 ms. */
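
/*
 * Illustrative arithmetic (assuming stathz = 127, so realstathz = 127):
 * sched_slice presumably ends up as 127 / SCHED_SLICE_DEFAULT_DIVISOR = 12
 * stathz ticks, roughly 94 ms, and sched_slice_min as
 * 12 / SCHED_SLICE_MIN_DIVISOR = 2 ticks, roughly 16 ms, matching the
 * figures noted above.
 */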
      
      /* Flags kept in td_flags. */
      #define        TDF_SLICEEND        TDF_SCHED2        /* Thread time slice is over. */
      
      /*
       * tickincr:                Converts a stathz tick into a hz domain scaled by
       *                        the shift factor.  Without the shift the error rate
       *                        due to rounding would be unacceptably high.
 * realstathz:		stathz is sometimes 0 and we run off of hz instead.
       * sched_slice:                Runtime of each thread before rescheduling.
       * preempt_thresh:        Priority threshold for preemption and remote IPIs.
       */
      static int __read_mostly sched_interact = SCHED_INTERACT_THRESH;
      static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT;
      static int __read_mostly realstathz = 127;        /* reset during boot. */
      static int __read_mostly sched_slice = 10;        /* reset during boot. */
      static int __read_mostly sched_slice_min = 1;        /* reset during boot. */
      #ifdef PREEMPTION
      #ifdef FULL_PREEMPTION
      static int __read_mostly preempt_thresh = PRI_MAX_IDLE;
      #else
      static int __read_mostly preempt_thresh = PRI_MIN_KERN;
      #endif
      #else 
      static int __read_mostly preempt_thresh = 0;
      #endif
      static int __read_mostly static_boost = PRI_MIN_BATCH;
      static int __read_mostly sched_idlespins = 10000;
      static int __read_mostly sched_idlespinthresh = -1;
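
/*
 * Illustrative note (assumption, boot-time recalculation not shown here):
 * with hz = 1000 and stathz = 127, tickincr works out to about
 * (1000 << SCHED_TICK_SHIFT) / 127 ~= 8063, i.e. each stathz tick charges
 * roughly eight hz ticks of scaled time, which is why the static default
 * above is 8 << SCHED_TICK_SHIFT.
 */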
      
      /*
       * tdq - per processor runqs and statistics.  All fields are protected by the
 * tdq_lock.  The load and lowpri may be read without the lock to avoid
 * excess locking in sched_pickcpu().
       */
      struct tdq {
              /* 
               * Ordered to improve efficiency of cpu_search() and switch().
               * tdq_lock is padded to avoid false sharing with tdq_load and
               * tdq_cpu_idle.
               */
              struct mtx_padalign tdq_lock;                /* run queue lock. */
              struct cpu_group *tdq_cg;                /* Pointer to cpu topology. */
              volatile int        tdq_load;                /* Aggregate load. */
              volatile int        tdq_cpu_idle;                /* cpu_idle() is active. */
              int                tdq_sysload;                /* For loadavg, !ITHD load. */
              volatile int        tdq_transferable;        /* Transferable thread count. */
              volatile short        tdq_switchcnt;                /* Switches this tick. */
              volatile short        tdq_oldswitchcnt;        /* Switches last tick. */
              u_char                tdq_lowpri;                /* Lowest priority thread. */
              u_char                tdq_owepreempt;                /* Remote preemption pending. */
              u_char                tdq_idx;                /* Current insert index. */
              u_char                tdq_ridx;                /* Current removal index. */
              int                tdq_id;                        /* cpuid. */
              struct runq        tdq_realtime;                /* real-time run queue. */
              struct runq        tdq_timeshare;                /* timeshare run queue. */
              struct runq        tdq_idle;                /* Queue of IDLE threads. */
              char                tdq_name[TDQ_NAME_LEN];
      #ifdef KTR
              char                tdq_loadname[TDQ_LOADNAME_LEN];
      #endif
      } __aligned(64);
      
      /* Idle thread states and config. */
      #define        TDQ_RUNNING        1
      #define        TDQ_IDLE        2
      
      #ifdef SMP
      struct cpu_group __read_mostly *cpu_top;                /* CPU topology */
      
      #define        SCHED_AFFINITY_DEFAULT        (max(1, hz / 1000))
      #define        SCHED_AFFINITY(ts, t)        ((ts)->ts_rltick > ticks - ((t) * affinity))
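
/*
 * Illustrative example (assuming hz = 1000): affinity defaults to
 * max(1, 1000 / 1000) = 1, so SCHED_AFFINITY(ts, t) is true when the thread
 * last ran within t ticks (about t milliseconds) of the current "ticks"
 * value, i.e. its cache footprint at that sharing level is still warm.
 */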
      
      /*
       * Run-time tunables.
       */
      static int rebalance = 1;
      static int balance_interval = 128;        /* Default set in sched_initticks(). */
      static int __read_mostly affinity;
      static int __read_mostly steal_idle = 1;
      static int __read_mostly steal_thresh = 2;
      static int __read_mostly always_steal = 0;
      static int __read_mostly trysteal_limit = 2;
      
      /*
       * One thread queue per processor.
       */
      static struct tdq __read_mostly *balance_tdq;
      static int balance_ticks;
      DPCPU_DEFINE_STATIC(struct tdq, tdq);
      DPCPU_DEFINE_STATIC(uint32_t, randomval);
      
      #define        TDQ_SELF()        ((struct tdq *)PCPU_GET(sched))
      #define        TDQ_CPU(x)        (DPCPU_ID_PTR((x), tdq))
      #define        TDQ_ID(x)        ((x)->tdq_id)
      #else        /* !SMP */
      static struct tdq        tdq_cpu;
      
      #define        TDQ_ID(x)        (0)
      #define        TDQ_SELF()        (&tdq_cpu)
      #define        TDQ_CPU(x)        (&tdq_cpu)
      #endif
      
      #define        TDQ_LOCK_ASSERT(t, type)        mtx_assert(TDQ_LOCKPTR((t)), (type))
      #define        TDQ_LOCK(t)                mtx_lock_spin(TDQ_LOCKPTR((t)))
      #define        TDQ_LOCK_FLAGS(t, f)        mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
      #define        TDQ_UNLOCK(t)                mtx_unlock_spin(TDQ_LOCKPTR((t)))
      #define        TDQ_LOCKPTR(t)                ((struct mtx *)(&(t)->tdq_lock))
      
      static void sched_priority(struct thread *);
      static void sched_thread_priority(struct thread *, u_char);
      static int sched_interact_score(struct thread *);
      static void sched_interact_update(struct thread *);
      static void sched_interact_fork(struct thread *);
      static void sched_pctcpu_update(struct td_sched *, int);
      
      /* Operations on per processor queues */
      static struct thread *tdq_choose(struct tdq *);
      static void tdq_setup(struct tdq *, int i);
      static void tdq_load_add(struct tdq *, struct thread *);
      static void tdq_load_rem(struct tdq *, struct thread *);
      static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
      static __inline void tdq_runq_rem(struct tdq *, struct thread *);
      static inline int sched_shouldpreempt(int, int, int);
      void tdq_print(int cpu);
      static void runq_print(struct runq *rq);
      static void tdq_add(struct tdq *, struct thread *, int);
      #ifdef SMP
      static struct thread *tdq_move(struct tdq *, struct tdq *);
      static int tdq_idled(struct tdq *);
      static void tdq_notify(struct tdq *, struct thread *);
      static struct thread *tdq_steal(struct tdq *, int);
      static struct thread *runq_steal(struct runq *, int);
      static int sched_pickcpu(struct thread *, int);
      static void sched_balance(void);
      static int sched_balance_pair(struct tdq *, struct tdq *);
      static inline struct tdq *sched_setcpu(struct thread *, int, int);
      static inline void thread_unblock_switch(struct thread *, struct mtx *);
      static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS);
      static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, 
          struct cpu_group *cg, int indent);
      #endif
      
      static void sched_setup(void *dummy);
      SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
      
      static void sched_initticks(void *dummy);
      SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
          NULL);
      
      SDT_PROVIDER_DEFINE(sched);
      
      SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", 
          "struct proc *", "uint8_t");
      SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", 
          "struct proc *", "void *");
      SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", 
          "struct proc *", "void *", "int");
      SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", 
          "struct proc *", "uint8_t", "struct thread *");
      SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
      SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", 
          "struct proc *");
      SDT_PROBE_DEFINE(sched, , , on__cpu);
      SDT_PROBE_DEFINE(sched, , , remain__cpu);
      SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", 
          "struct proc *");
      
      /*
       * Print the threads waiting on a run-queue.
       */
      static void
      runq_print(struct runq *rq)
      {
              struct rqhead *rqh;
              struct thread *td;
              int pri;
              int j;
              int i;
      
              for (i = 0; i < RQB_LEN; i++) {
                      printf("\t\trunq bits %d 0x%zx\n",
                          i, rq->rq_status.rqb_bits[i]);
                      for (j = 0; j < RQB_BPW; j++)
                              if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
                                      pri = j + (i << RQB_L2BPW);
                                      rqh = &rq->rq_queues[pri];
                                      TAILQ_FOREACH(td, rqh, td_runq) {
                                              printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
                                                  td, td->td_name, td->td_priority,
                                                  td->td_rqindex, pri);
                                      }
                              }
              }
      }
      
      /*
       * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
       */
      void
      tdq_print(int cpu)
      {
              struct tdq *tdq;
      
              tdq = TDQ_CPU(cpu);
      
              printf("tdq %d:\n", TDQ_ID(tdq));
              printf("\tlock            %p\n", TDQ_LOCKPTR(tdq));
              printf("\tLock name:      %s\n", tdq->tdq_name);
              printf("\tload:           %d\n", tdq->tdq_load);
              printf("\tswitch cnt:     %d\n", tdq->tdq_switchcnt);
              printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt);
              printf("\ttimeshare idx:  %d\n", tdq->tdq_idx);
              printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
              printf("\tload transferable: %d\n", tdq->tdq_transferable);
              printf("\tlowest priority:   %d\n", tdq->tdq_lowpri);
              printf("\trealtime runq:\n");
              runq_print(&tdq->tdq_realtime);
              printf("\ttimeshare runq:\n");
              runq_print(&tdq->tdq_timeshare);
              printf("\tidle runq:\n");
              runq_print(&tdq->tdq_idle);
      }
      
      static inline int
      sched_shouldpreempt(int pri, int cpri, int remote)
      {
              /*
               * If the new priority is not better than the current priority there is
               * nothing to do.
               */
	if (pri >= cpri)
                      return (0);
              /*
               * Always preempt idle.
               */
	if (cpri >= PRI_MIN_IDLE)
                      return (1);
              /*
               * If preemption is disabled don't preempt others.
               */
	if (preempt_thresh == 0)
                      return (0);
              /*
               * Preempt if we exceed the threshold.
               */
	if (pri <= preempt_thresh)
                      return (1);
              /*
	 * If we're interactive or better and something non-interactive
	 * or worse is running, preempt only remote processors.
               */
              if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT)
                      return (1);
              return (0);
      }
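
/*
 * Decision sketch (illustrative, assuming the PREEMPTION default of
 * preempt_thresh = PRI_MIN_KERN): a waking interrupt thread always preempts
 * an idle cpu and also preempts timeshare work because its priority is at
 * or below the threshold; a waking timeshare thread does not preempt another
 * timeshare thread locally, but may still trigger a remote IPI when it is
 * interactive and the target cpu is running batch work.
 */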
      
      /*
       * Add a thread to the actual run-queue.  Keeps transferable counts up to
       * date with what is actually on the run-queue.  Selects the correct
       * queue position for timeshare threads.
       */
      static __inline void
      tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
{
              struct td_sched *ts;
              u_char pri;
      
              TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
      
              pri = td->td_priority;
              ts = td_get_sched(td);
              TD_SET_RUNQ(td);
	if (THREAD_CAN_MIGRATE(td)) {
                      tdq->tdq_transferable++;
                      ts->ts_flags |= TSF_XFERABLE;
              }
              if (pri < PRI_MIN_BATCH) {
		ts->ts_runq = &tdq->tdq_realtime;
              } else if (pri <= PRI_MAX_BATCH) {
                      ts->ts_runq = &tdq->tdq_timeshare;
                      KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH,
                              ("Invalid priority %d on timeshare runq", pri));
                      /*
                       * This queue contains only priorities between MIN and MAX
		 * batch.  Use the whole queue to represent these values.
                       */
                      if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
                              pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE;
                              pri = (pri + tdq->tdq_idx) % RQ_NQS;
                              /*
                               * This effectively shortens the queue by one so we
                               * can have a one slot difference between idx and
                               * ridx while we wait for threads to drain.
                               */
			if (tdq->tdq_ridx != tdq->tdq_idx &&
			    pri == tdq->tdq_ridx)
				pri = (unsigned char)(pri - 1) % RQ_NQS;
		} else
			pri = tdq->tdq_ridx;
                      runq_add_pri(ts->ts_runq, td, pri, flags);
                      return;
              } else
                      ts->ts_runq = &tdq->tdq_idle;
              runq_add(ts->ts_runq, td, flags);
      }
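
/*
 * Insertion sketch (illustrative, assuming RQ_NQS = 64 and the classic batch
 * range of 72 priorities): a priority in the middle of the batch range maps
 * to roughly queue 32, which is then rotated by tdq_idx so new batch threads
 * land behind the current head tdq_ridx.  Higher (worse) priorities land
 * proportionally further behind, which is what gives batch threads their
 * priority-weighted round robin.
 */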
      
      /* 
       * Remove a thread from a run-queue.  This typically happens when a thread
       * is selected to run.  Running threads are not on the queue and the
       * transferable count does not reflect them.
       */
      static __inline void
      tdq_runq_rem(struct tdq *tdq, struct thread *td)
{
              struct td_sched *ts;
      
              ts = td_get_sched(td);
              TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
              KASSERT(ts->ts_runq != NULL,
                  ("tdq_runq_remove: thread %p null ts_runq", td));
	if (ts->ts_flags & TSF_XFERABLE) {
		tdq->tdq_transferable--;
                      ts->ts_flags &= ~TSF_XFERABLE;
              }
              if (ts->ts_runq == &tdq->tdq_timeshare) {
                      if (tdq->tdq_idx != tdq->tdq_ridx)
			runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
		else
			runq_remove_idx(ts->ts_runq, td, NULL);
	} else
		runq_remove(ts->ts_runq, td);
      }
      
      /*
       * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
       * for this thread to the referenced thread queue.
       */
      static void
      tdq_load_add(struct tdq *tdq, struct thread *td)
      {
      
              TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
      
              tdq->tdq_load++;
              if ((td->td_flags & TDF_NOLOAD) == 0)
		tdq->tdq_sysload++;
	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
	SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load);
      }
      
      /*
       * Remove the load from a thread that is transitioning to a sleep state or
       * exiting.
       */
      static void
      tdq_load_rem(struct tdq *tdq, struct thread *td)
{
      
              TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
              KASSERT(tdq->tdq_load != 0,
                  ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
      
              tdq->tdq_load--;
              if ((td->td_flags & TDF_NOLOAD) == 0)
		tdq->tdq_sysload--;
	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
	SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load);
      }
      
      /*
       * Bound timeshare latency by decreasing slice size as load increases.  We
       * consider the maximum latency as the sum of the threads waiting to run
       * aside from curthread and target no more than sched_slice latency but
       * no less than sched_slice_min runtime.
       */
      static inline int
      tdq_slice(struct tdq *tdq)
      {
              int load;
      
              /*
	 * It is safe to use tdq_sysload here because this is called from
               * contexts where timeshare threads are running and so there
               * cannot be higher priority load in the system.
               */
              load = tdq->tdq_sysload - 1;
              if (load >= SCHED_SLICE_MIN_DIVISOR)
                      return (sched_slice_min);
              if (load <= 1)
                      return (sched_slice);
              return (sched_slice / load);
      }
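
/*
 * Worked example (illustrative, assuming sched_slice = 12 and
 * sched_slice_min = 2 stathz ticks): with one or two timeshare threads the
 * full 12 tick slice is granted, with four threads each gets 12 / 3 = 4
 * ticks, and with six or more the slice is clamped to the 2 tick minimum so
 * that worst-case wait time stays bounded.
 */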
      
      /*
       * Set lowpri to its exact value by searching the run-queue and
       * evaluating curthread.  curthread may be passed as an optimization.
       */
      static void
      tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
      {
              struct thread *td;
      
              TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	if (ctd == NULL)
                      ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
              td = tdq_choose(tdq);
	if (td == NULL || td->td_priority > ctd->td_priority)
		tdq->tdq_lowpri = ctd->td_priority;
	else
		tdq->tdq_lowpri = td->td_priority;
      }
      
      #ifdef SMP
      /*
       * We need some randomness. Implement a classic Linear Congruential
       * Generator X_{n+1}=(aX_n+c) mod m. These values are optimized for
       * m = 2^32, a = 69069 and c = 5. We only return the upper 16 bits
       * of the random state (in the low bits of our answer) to keep
       * the maximum randomness.
       */
      static uint32_t
      sched_random(void)
      {
              uint32_t *rndptr;
      
              rndptr = DPCPU_PTR(randomval);
              *rndptr = *rndptr * 69069 + 5;
      
              return (*rndptr >> 16);
      }
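
/*
 * Usage sketch: each call advances the per-cpu state as
 * X' = (69069 * X + 5) mod 2^32 and returns X' >> 16, a value in
 * [0, 65535].  The consumers below reduce it further, e.g. modulo 32 for
 * the per-cpu load jitter in cpu_search() and modulo balance_interval in
 * sched_balance().
 */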
      
      struct cpu_search {
              cpuset_t cs_mask;
              u_int        cs_prefer;
              int        cs_pri;                /* Min priority for low. */
              int        cs_limit;        /* Max load for low, min load for high. */
              int        cs_cpu;
              int        cs_load;
      };
      
      #define        CPU_SEARCH_LOWEST        0x1
      #define        CPU_SEARCH_HIGHEST        0x2
      #define        CPU_SEARCH_BOTH                (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
      
      static __always_inline int cpu_search(const struct cpu_group *cg,
          struct cpu_search *low, struct cpu_search *high, const int match);
      int __noinline cpu_search_lowest(const struct cpu_group *cg,
          struct cpu_search *low);
      int __noinline cpu_search_highest(const struct cpu_group *cg,
          struct cpu_search *high);
      int __noinline cpu_search_both(const struct cpu_group *cg,
          struct cpu_search *low, struct cpu_search *high);
      
      /*
       * Search the tree of cpu_groups for the lowest or highest loaded cpu
       * according to the match argument.  This routine actually compares the
       * load on all paths through the tree and finds the least loaded cpu on
       * the least loaded path, which may differ from the least loaded cpu in
       * the system.  This balances work among caches and buses.
       *
       * This inline is instantiated in three forms below using constants for the
       * match argument.  It is reduced to the minimum set for each case.  It is
       * also recursive to the depth of the tree.
       */
      static __always_inline int
      cpu_search(const struct cpu_group *cg, struct cpu_search *low,
          struct cpu_search *high, const int match)
      {
              struct cpu_search lgroup;
              struct cpu_search hgroup;
              cpuset_t cpumask;
              struct cpu_group *child;
              struct tdq *tdq;
              int cpu, i, hload, lload, load, total, rnd;
      
              total = 0;
              cpumask = cg->cg_mask;
              if (match & CPU_SEARCH_LOWEST) {
                      lload = INT_MAX;
                      lgroup = *low;
              }
              if (match & CPU_SEARCH_HIGHEST) {
                      hload = INT_MIN;
                      hgroup = *high;
              }
      
              /* Iterate through the child CPU groups and then remaining CPUs. */
              for (i = cg->cg_children, cpu = mp_maxid; ; ) {
                      if (i == 0) {
      #ifdef HAVE_INLINE_FFSL
			cpu = CPU_FFS(&cpumask) - 1;
      #else
                              while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask))
                                      cpu--;
      #endif
                              if (cpu < 0)
                                      break;
                              child = NULL;
                      } else
                              child = &cg->cg_child[i - 1];
      
                      if (match & CPU_SEARCH_LOWEST)
                              lgroup.cs_cpu = -1;
                      if (match & CPU_SEARCH_HIGHEST)
                              hgroup.cs_cpu = -1;
                      if (child) {                        /* Handle child CPU group. */
                              CPU_ANDNOT(&cpumask, &child->cg_mask);
                              switch (match) {
                              case CPU_SEARCH_LOWEST:
                                      load = cpu_search_lowest(child, &lgroup);
                                      break;
                              case CPU_SEARCH_HIGHEST:
                                      load = cpu_search_highest(child, &hgroup);
                                      break;
                              case CPU_SEARCH_BOTH:
                                      load = cpu_search_both(child, &lgroup, &hgroup);
                                      break;
                              }
                      } else {                        /* Handle child CPU. */
                              CPU_CLR(cpu, &cpumask);
                              tdq = TDQ_CPU(cpu);
                              load = tdq->tdq_load * 256;
                              rnd = sched_random() % 32;
                              if (match & CPU_SEARCH_LOWEST) {
                                      if (cpu == low->cs_prefer)
                                              load -= 64;
				/* If that CPU is allowed, record its data. */
				if (tdq->tdq_lowpri > lgroup.cs_pri &&
				    tdq->tdq_load <= lgroup.cs_limit &&
				    CPU_ISSET(cpu, &lgroup.cs_mask)) {
					lgroup.cs_cpu = cpu;
                                              lgroup.cs_load = load - rnd;
                                      }
                              }
                              if (match & CPU_SEARCH_HIGHEST)
				if (tdq->tdq_load >= hgroup.cs_limit &&
				    tdq->tdq_transferable &&
				    CPU_ISSET(cpu, &hgroup.cs_mask)) {
					hgroup.cs_cpu = cpu;
                                              hgroup.cs_load = load - rnd;
                                      }
                      }
                      total += load;
      
                      /* We have info about child item. Compare it. */
                      if (match & CPU_SEARCH_LOWEST) {
			if (lgroup.cs_cpu >= 0 &&
			    (load < lload ||
			     (load == lload && lgroup.cs_load < low->cs_load))) {
				lload = load;
				low->cs_cpu = lgroup.cs_cpu;
				low->cs_load = lgroup.cs_load;
                              }
                      }
                      if (match & CPU_SEARCH_HIGHEST)
			if (hgroup.cs_cpu >= 0 &&
			    (load > hload ||
			     (load == hload && hgroup.cs_load > high->cs_load))) {
				hload = load;
				high->cs_cpu = hgroup.cs_cpu;
				high->cs_load = hgroup.cs_load;
			}
		if (child) {
                              i--;
                              if (i == 0 && CPU_EMPTY(&cpumask))
                                      break;
                      }
      #ifndef HAVE_INLINE_FFSL
                      else
                              cpu--;
      #endif
              }
              return (total);
      }
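
/*
 * Scoring sketch (illustrative): a leaf cpu contributes tdq_load * 256,
 * minus 64 if it is the preferred cpu and minus a random 0-31 jitter, so
 * equally loaded cpus are separated only by preference and jitter.  At each
 * level the per-group totals are compared first, which is why the result is
 * the least loaded cpu on the least loaded path rather than the globally
 * least loaded cpu.
 */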
      
      /*
       * cpu_search instantiations must pass constants to maintain the inline
       * optimization.
       */
      int
      cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
{
	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
      }
      
      int
      cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
{
	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
      }
      
      int
      cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
          struct cpu_search *high)
      {
              return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
      }
      
      /*
       * Find the cpu with the least load via the least loaded path that has a
 * lowpri greater than pri.  A pri of -1 indicates any priority is
       * acceptable.
       */
      static inline int
      sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
          int prefer)
      {
              struct cpu_search low;
      
              low.cs_cpu = -1;
              low.cs_prefer = prefer;
              low.cs_mask = mask;
              low.cs_pri = pri;
              low.cs_limit = maxload;
              cpu_search_lowest(cg, &low);
              return low.cs_cpu;
      }
      
      /*
       * Find the cpu with the highest load via the highest loaded path.
       */
      static inline int
      sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
      {
              struct cpu_search high;
      
              high.cs_cpu = -1;
              high.cs_mask = mask;
              high.cs_limit = minload;
              cpu_search_highest(cg, &high);
              return high.cs_cpu;
      }
      
      static void
      sched_balance_group(struct cpu_group *cg)
      {
              struct tdq *tdq;
              cpuset_t hmask, lmask;
              int high, low, anylow;
      
              CPU_FILL(&hmask);
              for (;;) {
                      high = sched_highest(cg, hmask, 2);
		/* Stop if there is no more CPU with transferable threads. */
                      if (high == -1)
                              break;
                      CPU_CLR(high, &hmask);
                      CPU_COPY(&hmask, &lmask);
                      /* Stop if there is no more CPU left for low. */
                      if (CPU_EMPTY(&lmask))
                              break;
                      anylow = 1;
                      tdq = TDQ_CPU(high);
      nextlow:
                      low = sched_lowest(cg, lmask, -1, tdq->tdq_load - 1, high);
		/* Stop if the unrestricted first pass found no less loaded CPU. */
                      if (anylow && low == -1)
                              break;
                      /* Go to next high if we found no less loaded CPU. */
                      if (low == -1)
                              continue;
                      /* Transfer thread from high to low. */
                      if (sched_balance_pair(tdq, TDQ_CPU(low))) {
                              /* CPU that got thread can no longer be a donor. */
                              CPU_CLR(low, &hmask);
                      } else {
                              /*
			 * If the move failed, then there are no threads on high
			 * that can run on this low.  Drop low from the low
			 * mask and look for a different one.
                               */
                              CPU_CLR(low, &lmask);
                              anylow = 0;
                              goto nextlow;
                      }
              }
      }
      
      static void
      sched_balance(void)
      {
              struct tdq *tdq;
      
              balance_ticks = max(balance_interval / 2, 1) +
                  (sched_random() % balance_interval);
              tdq = TDQ_SELF();
              TDQ_UNLOCK(tdq);
              sched_balance_group(cpu_top);
              TDQ_LOCK(tdq);
      }
      
      /*
       * Lock two thread queues using their address to maintain lock order.
       */
      static void
      tdq_lock_pair(struct tdq *one, struct tdq *two)
      {
              if (one < two) {
		TDQ_LOCK(one);
                      TDQ_LOCK_FLAGS(two, MTX_DUPOK);
              } else {
		TDQ_LOCK(two);
                      TDQ_LOCK_FLAGS(one, MTX_DUPOK);
              }
      }
      
      /*
       * Unlock two thread queues.  Order is not important here.
       */
      static void
      tdq_unlock_pair(struct tdq *one, struct tdq *two)
      {
              TDQ_UNLOCK(one);
              TDQ_UNLOCK(two);
      }
      
      /*
       * Transfer load between two imbalanced thread queues.
       */
      static int
      sched_balance_pair(struct tdq *high, struct tdq *low)
      {
              struct thread *td;
              int cpu;
      
              tdq_lock_pair(high, low);
              td = NULL;
              /*
               * Transfer a thread from high to low.
               */
              if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
                  (td = tdq_move(high, low)) != NULL) {
                      /*
                       * In case the target isn't the current cpu notify it of the
                       * new load, possibly sending an IPI to force it to reschedule.
                       */
                      cpu = TDQ_ID(low);
                      if (cpu != PCPU_GET(cpuid))
                              tdq_notify(low, td);
              }
              tdq_unlock_pair(high, low);
              return (td != NULL);
      }
      
      /*
       * Move a thread from one thread queue to another.
       */
      static struct thread *
      tdq_move(struct tdq *from, struct tdq *to)
{
              struct thread *td;
              struct tdq *tdq;
              int cpu;
      
              TDQ_LOCK_ASSERT(from, MA_OWNED);
              TDQ_LOCK_ASSERT(to, MA_OWNED);
      
              tdq = from;
              cpu = TDQ_ID(to);
	td = tdq_steal(tdq, cpu);
              if (td == NULL)
                      return (NULL);
      
              /*
               * Although the run queue is locked the thread may be
               * blocked.  We can not set the lock until it is unblocked.
               */
              thread_lock_block_wait(td);
              sched_rem(td);
              THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(from));
	td->td_lock = TDQ_LOCKPTR(to);
              td_get_sched(td)->ts_cpu = cpu;
              tdq_add(to, td, SRQ_YIELDING);
      
              return (td);
      }
      
      /*
       * This tdq has idled.  Try to steal a thread from another cpu and switch
       * to it.
       */
      static int
      tdq_idled(struct tdq *tdq)
      {
              struct cpu_group *cg;
              struct tdq *steal;
              cpuset_t mask;
              int cpu, switchcnt;
      
              if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL)
                      return (1);
              CPU_FILL(&mask);
              CPU_CLR(PCPU_GET(cpuid), &mask);
          restart:
              switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
              for (cg = tdq->tdq_cg; ; ) {
                      cpu = sched_highest(cg, mask, steal_thresh);
                      /*
                       * We were assigned a thread but not preempted.  Returning
                       * 0 here will cause our caller to switch to it.
                       */
                      if (tdq->tdq_load)
                              return (0);
                      if (cpu == -1) {
                              cg = cg->cg_parent;
                              if (cg == NULL)
                                      return (1);
                              continue;
                      }
                      steal = TDQ_CPU(cpu);
                      /*
                       * The data returned by sched_highest() is stale and
                       * the chosen CPU no longer has an eligible thread.
                       *
                       * Testing this ahead of tdq_lock_pair() only catches
                       * this situation about 20% of the time on an 8 core
                       * 16 thread Ryzen 7, but it still helps performance.
                       */
                      if (steal->tdq_load < steal_thresh ||
                          steal->tdq_transferable == 0)
                              goto restart;
                      tdq_lock_pair(tdq, steal);
                      /*
                       * We were assigned a thread while waiting for the locks.
                       * Switch to it now instead of stealing a thread.
                       */
                      if (tdq->tdq_load)
                              break;
                      /*
                       * The data returned by sched_highest() is stale and
                       * the chosen CPU no longer has an eligible thread, or
                       * we were preempted and the CPU loading info may be out
                       * of date.  The latter is rare.  In either case restart
                       * the search.
                       */
                      if (steal->tdq_load < steal_thresh ||
                          steal->tdq_transferable == 0 ||
                          switchcnt != tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt) {
                              tdq_unlock_pair(tdq, steal);
                              goto restart;
                      }
                      /*
                       * Steal the thread and switch to it.
                       */
                      if (tdq_move(steal, tdq) != NULL)
                              break;
                      /*
                       * We failed to acquire a thread even though it looked
                       * like one was available.  This could be due to affinity
                       * restrictions or for other reasons.  Loop again after
                       * removing this CPU from the set.  The restart logic
                       * above does not restore this CPU to the set due to the
		 * likelihood of failing here again.
                       */
                      CPU_CLR(cpu, &mask);
                      tdq_unlock_pair(tdq, steal);
              }
              TDQ_UNLOCK(steal);
              mi_switch(SW_VOL | SWT_IDLE);
              return (0);
      }
      
      /*
       * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
       */
      static void
      tdq_notify(struct tdq *tdq, struct thread *td)
{
              struct thread *ctd;
              int pri;
              int cpu;
      
	if (tdq->tdq_owepreempt)
                      return;
              cpu = td_get_sched(td)->ts_cpu;
              pri = td->td_priority;
              ctd = pcpu_find(cpu)->pc_curthread;
	if (!sched_shouldpreempt(pri, ctd->td_priority, 1))
                      return;
      
              /*
               * Make sure that our caller's earlier update to tdq_load is
               * globally visible before we read tdq_cpu_idle.  Idle thread
               * accesses both of them without locks, and the order is important.
               */
              atomic_thread_fence_seq_cst();
      
	if (TD_IS_IDLETHREAD(ctd)) {
                      /*
                       * If the MD code has an idle wakeup routine try that before
                       * falling back to IPI.
                       */
		if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu))
                              return;
              }
      
              /*
               * The run queues have been updated, so any switch on the remote CPU
               * will satisfy the preemption request.
               */
              tdq->tdq_owepreempt = 1;
              ipi_cpu(cpu, IPI_PREEMPT);
      }
      
      /*
       * Steals load from a timeshare queue.  Honors the rotating queue head
       * index.
       */
      static struct thread *
      runq_steal_from(struct runq *rq, int cpu, u_char start)
      {
              struct rqbits *rqb;
              struct rqhead *rqh;
              struct thread *td, *first;
              int bit;
              int i;
      
              rqb = &rq->rq_status;
              bit = start & (RQB_BPW -1);
              first = NULL;
      again:
              for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
                      if (rqb->rqb_bits[i] == 0)
                              continue;
		if (bit == 0)
			bit = RQB_FFS(rqb->rqb_bits[i]);
		for (; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[i] & (1ul << bit)) == 0)
                                      continue;
                              rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)];
			TAILQ_FOREACH(td, rqh, td_runq) {
				if (first && THREAD_CAN_MIGRATE(td) &&
                                          THREAD_CAN_SCHED(td, cpu))
                                              return (td);
                                      first = td;
                              }
                      }
              }
	if (start != 0) {
                      start = 0;
                      goto again;
              }
      
	if (first && THREAD_CAN_MIGRATE(first) &&
                  THREAD_CAN_SCHED(first, cpu))
                      return (first);
              return (NULL);
      }
      
      /*
       * Steals load from a standard linear queue.
       */
      static struct thread *
      runq_steal(struct runq *rq, int cpu)
      {
              struct rqhead *rqh;
              struct rqbits *rqb;
              struct thread *td;
              int word;
              int bit;
      
              rqb = &rq->rq_status;
              for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
                                      continue;
                              rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(td, rqh, td_runq)
				if (THREAD_CAN_MIGRATE(td) &&
                                          THREAD_CAN_SCHED(td, cpu))
                                              return (td);
                      }
              }
              return (NULL);
      }
      
      /*
       * Attempt to steal a thread in priority order from a thread queue.
       */
      static struct thread *
      tdq_steal(struct tdq *tdq, int cpu)
      {
              struct thread *td;
      
              TDQ_LOCK_ASSERT(tdq, MA_OWNED);
              if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
                      return (td);
	if ((td = runq_steal_from(&tdq->tdq_timeshare,
                  cpu, tdq->tdq_ridx)) != NULL)
                      return (td);
              return (runq_steal(&tdq->tdq_idle, cpu));
      }
      
      /*
       * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
       * current lock and returns with the assigned queue locked.
       */
      static inline struct tdq *
      sched_setcpu(struct thread *td, int cpu, int flags)
      {
      
              struct tdq *tdq;
              struct mtx *mtx;
      
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              tdq = TDQ_CPU(cpu);
              td_get_sched(td)->ts_cpu = cpu;
              /*
               * If the lock matches just return the queue.
               */
              if (td->td_lock == TDQ_LOCKPTR(tdq)) {
		KASSERT((flags & SRQ_HOLD) == 0,
                          ("sched_setcpu: Invalid lock for SRQ_HOLD"));
                      return (tdq);
              }
      
              /*
               * The hard case, migration, we need to block the thread first to
               * prevent order reversals with other cpus locks.
               */
              spinlock_enter();
              mtx = thread_lock_block(td);
	if ((flags & SRQ_HOLD) == 0)
		mtx_unlock_spin(mtx);
              TDQ_LOCK(tdq);
              thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
              spinlock_exit();
              return (tdq);
      }
      
      SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
      SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
      SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
      SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
      SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
      SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
      
      static int
      sched_pickcpu(struct thread *td, int flags)
{
              struct cpu_group *cg, *ccg;
              struct td_sched *ts;
              struct tdq *tdq;
              cpuset_t mask;
              int cpu, pri, self, intr;
      
              self = PCPU_GET(cpuid);
              ts = td_get_sched(td);
              KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on "
                  "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name));
              if (smp_started == 0)
                      return (self);
              /*
               * Don't migrate a running thread from sched_switch().
               */
	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
                      return (ts->ts_cpu);
              /*
               * Prefer to run interrupt threads on the processors that generate
               * the interrupt.
               */
	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
                  curthread->td_intr_nesting_level) {
                      tdq = TDQ_SELF();
                      if (tdq->tdq_lowpri >= PRI_MIN_IDLE) {
                              SCHED_STAT_INC(pickcpu_idle_affinity);
                              return (self);
                      }
                      ts->ts_cpu = self;
                      intr = 1;
                      cg = tdq->tdq_cg;
                      goto llc;
              } else {
                      intr = 0;
                      tdq = TDQ_CPU(ts->ts_cpu);
                      cg = tdq->tdq_cg;
              }
              /*
               * If the thread can run on the last cpu and the affinity has not
               * expired and it is idle, run it there.
               */
	if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
	    tdq->tdq_lowpri >= PRI_MIN_IDLE &&
                  SCHED_AFFINITY(ts, CG_SHARE_L2)) {
                      if (cg->cg_flags & CG_FLAG_THREAD) {
                              /* Check all SMT threads for being idle. */
			for (cpu = CPU_FFS(&cg->cg_mask) - 1; ; cpu++) {
				if (CPU_ISSET(cpu, &cg->cg_mask) &&
                                          TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
                                              break;
				if (cpu >= mp_maxid) {
                                              SCHED_STAT_INC(pickcpu_idle_affinity);
                                              return (ts->ts_cpu);
                                      }
                              }
                      } else {
                              SCHED_STAT_INC(pickcpu_idle_affinity);
                              return (ts->ts_cpu);
                      }
              }
      llc:
              /*
               * Search for the last level cache CPU group in the tree.
               * Skip SMT, identical groups and caches with expired affinity.
	 * Interrupt threads' affinity is explicit and never expires.
               */
	for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
		if (cg->cg_flags & CG_FLAG_THREAD)
                              continue;
                      if (cg->cg_children == 1 || cg->cg_count == 1)
                              continue;
                      if (cg->cg_level == CG_SHARE_NONE ||
                          (!intr && !SCHED_AFFINITY(ts, cg->cg_level)))
                              continue;
                      ccg = cg;
              }
              /* Found LLC shared by all CPUs, so do a global search. */
              if (ccg == cpu_top)
                      ccg = NULL;
              cpu = -1;
              mask = td->td_cpuset->cs_mask;
              pri = td->td_priority;
              /*
                * Try hard to keep interrupts within the found LLC.  Search the LLC
                * for the least loaded CPU we can run now.  For NUMA systems it
                * should stay within the target domain, which also reduces
                * scheduling overhead.
               */
              if (ccg != NULL && intr) {
                      cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu);
                      if (cpu >= 0)
                              SCHED_STAT_INC(pickcpu_intrbind);
              } else
              /* Search the LLC for the least loaded idle CPU we can run now. */
               if (ccg != NULL) {
                      cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE),
                          INT_MAX, ts->ts_cpu);
                      if (cpu >= 0)
                              SCHED_STAT_INC(pickcpu_affinity);
              }
              /* Search globally for the least loaded CPU we can run now. */
              if (cpu < 0) {
                      cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
                      if (cpu >= 0)
                              SCHED_STAT_INC(pickcpu_lowest);
              }
              /* Search globally for the least loaded CPU. */
               if (cpu < 0) {
                      cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
                      if (cpu >= 0)
                              SCHED_STAT_INC(pickcpu_lowest);
              }
               KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu."));
              KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu));
              /*
               * Compare the lowest loaded cpu to current cpu.
               */
              tdq = TDQ_CPU(cpu);
               if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri &&
                   tdq->tdq_lowpri < PRI_MIN_IDLE &&
                   TDQ_SELF()->tdq_load <= tdq->tdq_load + 1) {
                      SCHED_STAT_INC(pickcpu_local);
                      cpu = self;
              }
              if (cpu != ts->ts_cpu)
                      SCHED_STAT_INC(pickcpu_migration);
              return (cpu);
      }
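       /*
        * To summarize the selection policy above: (1) keep bound or
        * self-queued threads where they are, (2) keep interrupt threads on
        * the interrupted CPU when it is idle, (3) honour recent last-CPU
        * cache affinity, (4) pick the least loaded CPU within the shared
        * last-level cache, (5) fall back to a global search, and finally
        * compare the winner against the current CPU to avoid a pointless
        * migration.
        */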
      #endif
      
      /*
       * Pick the highest priority task we have and return it.
       */
      static struct thread *
      tdq_choose(struct tdq *tdq)
       {
              struct thread *td;
      
              TDQ_LOCK_ASSERT(tdq, MA_OWNED);
              td = runq_choose(&tdq->tdq_realtime);
               if (td != NULL)
                      return (td);
              td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
              if (td != NULL) {
                       KASSERT(td->td_priority >= PRI_MIN_BATCH,
                          ("tdq_choose: Invalid priority on timeshare queue %d",
                          td->td_priority));
                      return (td);
              }
              td = runq_choose(&tdq->tdq_idle);
               if (td != NULL) {
                      KASSERT(td->td_priority >= PRI_MIN_IDLE,
                          ("tdq_choose: Invalid priority on idle queue %d",
                          td->td_priority));
                      return (td);
              }
      
              return (NULL);
      }
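       /*
        * Note: the order above encodes the class hierarchy.  The realtime
        * queue (which tdq_runq_add also uses for interactive timeshare
        * threads) always wins, then the circular timeshare queue indexed by
        * tdq_ridx, and the idle class only runs when nothing else is ready.
        */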
      
      /*
       * Initialize a thread queue.
       */
      static void
      tdq_setup(struct tdq *tdq, int id)
      {
      
              if (bootverbose)
                      printf("ULE: setup cpu %d\n", id);
              runq_init(&tdq->tdq_realtime);
              runq_init(&tdq->tdq_timeshare);
              runq_init(&tdq->tdq_idle);
              tdq->tdq_id = id;
              snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
                  "sched lock %d", (int)TDQ_ID(tdq));
              mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN);
      #ifdef KTR
              snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname),
                  "CPU %d load", (int)TDQ_ID(tdq));
      #endif
      }
      
      #ifdef SMP
      static void
      sched_setup_smp(void)
      {
              struct tdq *tdq;
              int i;
      
              cpu_top = smp_topo();
              CPU_FOREACH(i) {
                      tdq = DPCPU_ID_PTR(i, tdq);
                      tdq_setup(tdq, i);
                      tdq->tdq_cg = smp_topo_find(cpu_top, i);
                      if (tdq->tdq_cg == NULL)
                              panic("Can't find cpu group for %d\n", i);
              }
              PCPU_SET(sched, DPCPU_PTR(tdq));
              balance_tdq = TDQ_SELF();
      }
      #endif
      
      /*
       * Setup the thread queues and initialize the topology based on MD
       * information.
       */
      static void
      sched_setup(void *dummy)
      {
              struct tdq *tdq;
      
      #ifdef SMP
              sched_setup_smp();
      #else
              tdq_setup(TDQ_SELF(), 0);
      #endif
              tdq = TDQ_SELF();
      
              /* Add thread0's load since it's running. */
              TDQ_LOCK(tdq);
              thread0.td_lock = TDQ_LOCKPTR(tdq);
              tdq_load_add(tdq, &thread0);
              tdq->tdq_lowpri = thread0.td_priority;
              TDQ_UNLOCK(tdq);
      }
      
      /*
       * This routine determines time constants after stathz and hz are setup.
       */
      /* ARGSUSED */
      static void
      sched_initticks(void *dummy)
      {
              int incr;
      
              realstathz = stathz ? stathz : hz;
              sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR;
              sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
              hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
                  realstathz);
      
              /*
               * tickincr is shifted out by 10 to avoid rounding errors due to
               * hz not being evenly divisible by stathz on all platforms.
               */
              incr = (hz << SCHED_TICK_SHIFT) / realstathz;
              /*
                * This does not work for values of stathz that are more than
                * hz << SCHED_TICK_SHIFT.  In practice this does not happen.
               */
              if (incr == 0)
                      incr = 1;
              tickincr = incr;
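               /*
                * Worked example (illustrative only, assuming the common
                * defaults of hz = 1000, stathz = 127 and a slice divisor of
                * 10): sched_slice becomes 12 stathz ticks (~94ms) and
                * tickincr becomes (1000 << 10) / 127, roughly 8063.
                */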
      #ifdef SMP
              /*
               * Set the default balance interval now that we know
               * what realstathz is.
               */
              balance_interval = realstathz;
              balance_ticks = balance_interval;
              affinity = SCHED_AFFINITY_DEFAULT;
      #endif
              if (sched_idlespinthresh < 0)
                      sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz;
      }
      
      /*
       * This is the core of the interactivity algorithm.  Determines a score based
       * on past behavior.  It is the ratio of sleep time to run time scaled to
       * a [0, 100] integer.  This is the voluntary sleep time of a process, which
       * differs from the cpu usage because it does not account for time spent
       * waiting on a run-queue.  Would be prettier if we had floating point.
       *
       * When a thread's sleep time is greater than its run time the
       * calculation is:
       *
       *                           scaling factor 
       * interactivity score =  ---------------------
       *                        sleep time / run time
       *
       *
       * When a thread's run time is greater than its sleep time the
       * calculation is:
       *
       *                           scaling factor 
       * interactivity score =  ---------------------    + scaling factor
       *                        run time / sleep time
       */
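       /*
        * Illustrative numbers (not normative), assuming the default scaling
        * factor SCHED_INTERACT_HALF of 50: a thread with slptime = 300 and
        * runtime = 100 scores 100 / (300 / 50) = 16 (interactive), while a
        * thread with runtime = 300 and slptime = 100 scores
        * 50 + (50 - 100 / (300 / 50)) = 84 (batch).  Equal, non-zero sleep
        * and run time scores exactly SCHED_INTERACT_HALF.
        */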
      static int
      sched_interact_score(struct thread *td)
      {
              struct td_sched *ts;
              int div;
      
              ts = td_get_sched(td);
              /*
               * The score is only needed if this is likely to be an interactive
               * task.  Don't go through the expense of computing it if there's
               * no chance.
               */
               if (sched_interact <= SCHED_INTERACT_HALF &&
                      ts->ts_runtime >= ts->ts_slptime)
                              return (SCHED_INTERACT_HALF);
      
              if (ts->ts_runtime > ts->ts_slptime) {
                      div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
                      return (SCHED_INTERACT_HALF +
                          (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
              }
              if (ts->ts_slptime > ts->ts_runtime) {
                       div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
                      return (ts->ts_runtime / div);
              }
              /* runtime == slptime */
              if (ts->ts_runtime)
                      return (SCHED_INTERACT_HALF);
      
              /*
               * This can happen if slptime and runtime are 0.
               */
              return (0);
      
      }
      
      /*
       * Scale the scheduling priority according to the "interactivity" of this
       * process.
       */
      static void
      sched_priority(struct thread *td)
       {
              int score;
              int pri;
      
              if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
                      return;
              /*
               * If the score is interactive we place the thread in the realtime
               * queue with a priority that is less than kernel and interrupt
               * priorities.  These threads are not subject to nice restrictions.
               *
               * Scores greater than this are placed on the normal timeshare queue
               * where the priority is partially decided by the most recent cpu
               * utilization and the rest is decided by nice value.
               *
               * The nice value of the process has a linear effect on the calculated
               * score.  Negative nice values make it easier for a thread to be
               * considered interactive.
               */
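               /*
                * Rough illustration, assuming the default interactivity
                * threshold of 30 for sched_interact: a score of 0 maps to
                * PRI_MIN_INTERACT, a score of 29 maps close to
                * PRI_MAX_INTERACT, and a score of 30 or more falls through to
                * the batch range below, where recent CPU usage and nice pick
                * the final priority.
                */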
              score = imax(0, sched_interact_score(td) + td->td_proc->p_nice);
              if (score < sched_interact) {
                      pri = PRI_MIN_INTERACT;
                      pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) /
                          sched_interact) * score;
                       KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT,
                          ("sched_priority: invalid interactive priority %d score %d",
                          pri, score));
              } else {
                      pri = SCHED_PRI_MIN;
                      if (td_get_sched(td)->ts_ticks)
                               pri += min(SCHED_PRI_TICKS(td_get_sched(td)),
                                   SCHED_PRI_RANGE - 1);
                       pri += SCHED_PRI_NICE(td->td_proc->p_nice);
                       KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH,
                          ("sched_priority: invalid priority %d: nice %d, " 
                          "ticks %d ftick %d ltick %d tick pri %d",
                          pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks,
                          td_get_sched(td)->ts_ftick, td_get_sched(td)->ts_ltick,
                          SCHED_PRI_TICKS(td_get_sched(td))));
              }
               sched_user_prio(td, pri);
      
              return;
      }
      
      /*
       * This routine enforces a maximum limit on the amount of scheduling history
       * kept.  It is called after either the slptime or runtime is adjusted.  This
       * function is ugly due to integer math.
       */
      static void
      sched_interact_update(struct thread *td)
      {
              struct td_sched *ts;
              u_int sum;
      
              ts = td_get_sched(td);
              sum = ts->ts_runtime + ts->ts_slptime;
               if (sum < SCHED_SLP_RUN_MAX)
                      return;
              /*
               * This only happens from two places:
               * 1) We have added an unusual amount of run time from fork_exit.
               * 2) We have added an unusual amount of sleep time from sched_sleep().
               */
              if (sum > SCHED_SLP_RUN_MAX * 2) {
                      if (ts->ts_runtime > ts->ts_slptime) {
                              ts->ts_runtime = SCHED_SLP_RUN_MAX;
                              ts->ts_slptime = 1;
                      } else {
                               ts->ts_slptime = SCHED_SLP_RUN_MAX;
                              ts->ts_runtime = 1;
                      }
                      return;
              }
              /*
               * If we have exceeded by more than 1/5th then the algorithm below
               * will not bring us back into range.  Dividing by two here forces
                * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
               */
              if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
                       ts->ts_runtime /= 2;
                      ts->ts_slptime /= 2;
                      return;
              }
               ts->ts_runtime = (ts->ts_runtime / 5) * 4;
              ts->ts_slptime = (ts->ts_slptime / 5) * 4;
      }
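       /*
        * Decay behaviour of the function above, by example: a sum that has
        * just crossed SCHED_SLP_RUN_MAX is scaled by 4/5, trimming the oldest
        * fifth of the history; only when a single update overshoots by more
        * than a fifth do the cruder divide-by-two or clamp paths apply.
        */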
      
      /*
       * Scale back the interactivity history when a child thread is created.  The
       * history is inherited from the parent but the thread may behave totally
       * differently.  For example, a shell spawning a compiler process.  We want
       * to learn that the compiler is behaving badly very quickly.
       */
      static void
      sched_interact_fork(struct thread *td)
      {
              struct td_sched *ts;
              int ratio;
              int sum;
      
              ts = td_get_sched(td);
              sum = ts->ts_runtime + ts->ts_slptime;
              if (sum > SCHED_SLP_RUN_FORK) {
                      ratio = sum / SCHED_SLP_RUN_FORK;
                      ts->ts_runtime /= ratio;
                      ts->ts_slptime /= ratio;
              }
      }
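       /*
        * For example, a child created with three times SCHED_SLP_RUN_FORK of
        * combined history has both runtime and slptime divided by three, so
        * its own behaviour dominates its score after only a short amount of
        * real execution.
        */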
      
      /*
       * Called from proc0_init() to setup the scheduler fields.
       */
      void
      schedinit(void)
      {
              struct td_sched *ts0;
      
              /*
               * Set up the scheduler specific parts of thread0.
               */
              ts0 = td_get_sched(&thread0);
              ts0->ts_ltick = ticks;
              ts0->ts_ftick = ticks;
              ts0->ts_slice = 0;
              ts0->ts_cpu = curcpu;        /* set valid CPU number */
      }
      
      /*
        * This is only somewhat accurate since, given many processes of the same
        * priority, they will switch when their slices run out, which will be
        * at most sched_slice stathz ticks.
       */
      int
      sched_rr_interval(void)
      {
      
              /* Convert sched_slice from stathz to hz. */
              return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
      }
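       /*
        * Example conversion (illustrative, assuming hz = 1000, stathz = 127
        * and the resulting sched_slice of 12 stathz ticks): the interval
        * reported above is (12 * 1000 + 63) / 127, about 94 hz ticks, which
        * matches the roughly 100ms default slice.
        */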
      
      /*
       * Update the percent cpu tracking information when it is requested or
       * the total history exceeds the maximum.  We keep a sliding history of
       * tick counts that slowly decays.  This is less precise than the 4BSD
       * mechanism since it happens with less regular and frequent events.
       */
      static void
      sched_pctcpu_update(struct td_sched *ts, int run)
      {
              int t = ticks;
      
              /*
               * The signed difference may be negative if the thread hasn't run for
               * over half of the ticks rollover period.
               */
              if ((u_int)(t - ts->ts_ltick) >= SCHED_TICK_TARG) {
   15                 ts->ts_ticks = 0;
                      ts->ts_ftick = t - SCHED_TICK_TARG;
 23819         } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) {
 1208                 ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) *
                          (ts->ts_ltick - (t - SCHED_TICK_TARG));
                      ts->ts_ftick = t - SCHED_TICK_TARG;
              }
              if (run)
                      ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT;
              ts->ts_ltick = t;
      }
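       /*
        * In other words, [ts_ftick, ts_ltick] brackets the sampling window.
        * Once that window exceeds SCHED_TICK_MAX the tick count is rescaled
        * in proportion to the part of the window that is kept, leaving about
        * SCHED_TICK_TARG ticks of history and giving %CPU its sliding decay.
        */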
      
      /*
       * Adjust the priority of a thread.  Move it to the appropriate run-queue
       * if necessary.  This is the back-end for several priority related
       * functions.
       */
      static void
      sched_thread_priority(struct thread *td, u_char prio)
       {
              struct td_sched *ts;
              struct tdq *tdq;
              int oldpri;
      
              KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
                  "prio:%d", td->td_priority, "new prio:%d", prio,
                  KTR_ATTR_LINKED, sched_tdname(curthread));
               SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
               if (td != curthread && prio < td->td_priority) {
                      KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
                          "lend prio", "prio:%d", td->td_priority, "new prio:%d",
                          prio, KTR_ATTR_LINKED, sched_tdname(td));
                       SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio,
                          curthread);
              } 
              ts = td_get_sched(td);
              THREAD_LOCK_ASSERT(td, MA_OWNED);
               if (td->td_priority == prio)
                      return;
              /*
               * If the priority has been elevated due to priority
               * propagation, we may have to move ourselves to a new
               * queue.  This could be optimized to not re-add in some
               * cases.
               */
              if (TD_ON_RUNQ(td) && prio < td->td_priority) {
                       sched_rem(td);
                      td->td_priority = prio;
                      sched_add(td, SRQ_BORROWING | SRQ_HOLDTD);
                      return;
              }
              /*
               * If the thread is currently running we may have to adjust the lowpri
               * information so other cpus are aware of our current priority.
               */
              if (TD_IS_RUNNING(td)) {
                      tdq = TDQ_CPU(ts->ts_cpu);
                      oldpri = td->td_priority;
                      td->td_priority = prio;
                      if (prio < tdq->tdq_lowpri)
                               tdq->tdq_lowpri = prio;
                       else if (tdq->tdq_lowpri == oldpri)
                              tdq_setlowpri(tdq, td);
                      return;
              }
               td->td_priority = prio;
      }
      
      /*
       * Update a thread's priority when it is lent another thread's
       * priority.
       */
      void
      sched_lend_prio(struct thread *td, u_char prio)
       {
      
              td->td_flags |= TDF_BORROWING;
              sched_thread_priority(td, prio);
      }
      
      /*
       * Restore a thread's priority when priority propagation is
       * over.  The prio argument is the minimum priority the thread
       * needs to have to satisfy other possible priority lending
       * requests.  If the thread's regular priority is less
       * important than prio, the thread will keep a priority boost
       * of prio.
       */
      void
      sched_unlend_prio(struct thread *td, u_char prio)
       {
              u_char base_pri;
      
              if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
                  td->td_base_pri <= PRI_MAX_TIMESHARE)
                      base_pri = td->td_user_pri;
              else
                      base_pri = td->td_base_pri;
              if (prio >= base_pri) {
                       td->td_flags &= ~TDF_BORROWING;
                      sched_thread_priority(td, base_pri);
              } else
                       sched_lend_prio(td, prio);
      }
      
      /*
       * Standard entry for setting the priority to an absolute value.
       */
      void
      sched_prio(struct thread *td, u_char prio)
       {
              u_char oldprio;
      
              /* First, update the base priority. */
              td->td_base_pri = prio;
      
              /*
               * If the thread is borrowing another thread's priority, don't
               * ever lower the priority.
               */
               if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
                      return;
      
              /* Change the real priority. */
              oldprio = td->td_priority;
              sched_thread_priority(td, prio);
      
              /*
               * If the thread is on a turnstile, then let the turnstile update
               * its state.
               */
               if (TD_ON_LOCK(td) && oldprio != prio)
                      turnstile_adjust(td, oldprio);
      }
      
      /*
        * Set the base user priority; this does not affect the current running priority.
       */
      void
      sched_user_prio(struct thread *td, u_char prio)
      {
      
              td->td_base_user_pri = prio;
              if (td->td_lend_user_pri <= prio)
                      return;
               td->td_user_pri = prio;
      }
      
      void
      sched_lend_user_prio(struct thread *td, u_char prio)
      {
      
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              td->td_lend_user_pri = prio;
              td->td_user_pri = min(prio, td->td_base_user_pri);
              if (td->td_priority > td->td_user_pri)
                      sched_prio(td, td->td_user_pri);
              else if (td->td_priority != td->td_user_pri)
                      td->td_flags |= TDF_NEEDRESCHED;
      }
      
      /*
       * Like the above but first check if there is anything to do.
       */
      void
      sched_lend_user_prio_cond(struct thread *td, u_char prio)
      {
      
              if (td->td_lend_user_pri != prio)
                      goto lend;
              if (td->td_user_pri != min(prio, td->td_base_user_pri))
                      goto lend;
              if (td->td_priority >= td->td_user_pri)
                      goto lend;
              return;
      
      lend:
              thread_lock(td);
              sched_lend_user_prio(td, prio);
              thread_unlock(td);
      }
      
      #ifdef SMP
      /*
       * This tdq is about to idle.  Try to steal a thread from another CPU before
       * choosing the idle thread.
       */
      static void
      tdq_trysteal(struct tdq *tdq)
      {
              struct cpu_group *cg;
              struct tdq *steal;
              cpuset_t mask;
              int cpu, i;
      
              if (smp_started == 0 || trysteal_limit == 0 || tdq->tdq_cg == NULL)
                      return;
              CPU_FILL(&mask);
              CPU_CLR(PCPU_GET(cpuid), &mask);
              /* We don't want to be preempted while we're iterating. */
              spinlock_enter();
              TDQ_UNLOCK(tdq);
              for (i = 1, cg = tdq->tdq_cg; ; ) {
                      cpu = sched_highest(cg, mask, steal_thresh);
                      /*
                       * If a thread was added while interrupts were disabled don't
                       * steal one here.
                       */
                      if (tdq->tdq_load > 0) {
                               TDQ_LOCK(tdq);
                              break;
                      }
                      if (cpu == -1) {
                              i++;
                              cg = cg->cg_parent;
                              if (cg == NULL || i > trysteal_limit) {
                                       TDQ_LOCK(tdq);
                                      break;
                              }
                              continue;
                      }
                      steal = TDQ_CPU(cpu);
                      /*
                       * The data returned by sched_highest() is stale and
                       * the chosen CPU no longer has an eligible thread.
                       */
                       if (steal->tdq_load < steal_thresh ||
                          steal->tdq_transferable == 0)
                              continue;
                       tdq_lock_pair(tdq, steal);
                       /*
                        * If we get to this point, unconditionally exit the loop
                        * to bound the time spent in the critical section.
                       *
                       * If a thread was added while interrupts were disabled don't
                       * steal one here.
                       */
                      if (tdq->tdq_load > 0) {
                               TDQ_UNLOCK(steal);
                              break;
                      }
                      /*
                       * The data returned by sched_highest() is stale and
                       * the chosen CPU no longer has an eligible thread.
                       */
                       if (steal->tdq_load < steal_thresh ||
                          steal->tdq_transferable == 0) {
                              TDQ_UNLOCK(steal);
                              break;
                      }
                      /*
                       * If we fail to acquire one due to affinity restrictions,
                        * bail out and let the idle thread do a more complete search
                       * outside of a critical section.
                       */
                      if (tdq_move(steal, tdq) == NULL) {
                              TDQ_UNLOCK(steal);
                              break;
                      }
                       TDQ_UNLOCK(steal);
                      break;
              }
              spinlock_exit();
      }
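       /*
        * Note that spinlock_enter() above is what bounds the cost of this
        * opportunistic steal: with preemption disabled the loop climbs the
        * topology at most trysteal_limit levels and locks at most one remote
        * queue before either moving a thread or giving up and leaving the
        * exhaustive search to the idle thread.
        */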
      #endif
      
      /*
       * Handle migration from sched_switch().  This happens only for
       * cpu binding.
       */
      static struct mtx *
      sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
      {
              struct tdq *tdn;
      
              KASSERT(THREAD_CAN_MIGRATE(td) ||
                  (td_get_sched(td)->ts_flags & TSF_BOUND) != 0,
                  ("Thread %p shouldn't migrate", td));
              KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: "
                  "thread %s queued on absent CPU %d.", td->td_name,
                  td_get_sched(td)->ts_cpu));
              tdn = TDQ_CPU(td_get_sched(td)->ts_cpu);
      #ifdef SMP
              tdq_load_rem(tdq, td);
              /*
               * Do the lock dance required to avoid LOR.  We have an 
               * extra spinlock nesting from sched_switch() which will
               * prevent preemption while we're holding neither run-queue lock.
               */
              TDQ_UNLOCK(tdq);
              TDQ_LOCK(tdn);
              tdq_add(tdn, td, flags);
              tdq_notify(tdn, td);
              TDQ_UNLOCK(tdn);
              TDQ_LOCK(tdq);
      #endif
              return (TDQ_LOCKPTR(tdn));
      }
      
      /*
       * thread_lock_unblock() that does not assume td_lock is blocked.
       */
      static inline void
      thread_unblock_switch(struct thread *td, struct mtx *mtx)
      {
              atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
                  (uintptr_t)mtx);
      }
      
      /*
       * Switch threads.  This function has to handle threads coming in while
       * blocked for some reason, running, or idle.  It also must deal with
       * migrating a thread from one queue to another as running threads may
       * be assigned elsewhere via binding.
       */
      void
      sched_switch(struct thread *td, int flags)
       {
              struct thread *newtd;
              struct tdq *tdq;
              struct td_sched *ts;
              struct mtx *mtx;
              int srqflag;
              int cpuid, preempted;
      
              THREAD_LOCK_ASSERT(td, MA_OWNED);
      
              cpuid = PCPU_GET(cpuid);
              tdq = TDQ_SELF();
              ts = td_get_sched(td);
               sched_pctcpu_update(ts, 1);
              ts->ts_rltick = ticks;
              td->td_lastcpu = td->td_oncpu;
              preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
                  (flags & SW_PREEMPT) != 0;
              td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
              td->td_owepreempt = 0;
              tdq->tdq_owepreempt = 0;
              if (!TD_IS_IDLETHREAD(td))
                       tdq->tdq_switchcnt++;
      
              /*
               * Always block the thread lock so we can drop the tdq lock early.
               */
              mtx = thread_lock_block(td);
              spinlock_enter();
              if (TD_IS_IDLETHREAD(td)) {
                      MPASS(mtx == TDQ_LOCKPTR(tdq));
                      TD_SET_CAN_RUN(td);
              } else if (TD_IS_RUNNING(td)) {
                      MPASS(mtx == TDQ_LOCKPTR(tdq));
                      srqflag = preempted ?
                          SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
                          SRQ_OURSELF|SRQ_YIELDING;
      #ifdef SMP
                       if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu))
                              ts->ts_cpu = sched_pickcpu(td, 0);
      #endif
                      if (ts->ts_cpu == cpuid)
                               tdq_runq_add(tdq, td, srqflag);
                      else
                              mtx = sched_switch_migrate(tdq, td, srqflag);
              } else {
                      /* This thread must be going to sleep. */
                       if (mtx != TDQ_LOCKPTR(tdq)) {
                               mtx_unlock_spin(mtx);
                              TDQ_LOCK(tdq);
                      }
                      tdq_load_rem(tdq, td);
      #ifdef SMP
                       if (tdq->tdq_load == 0)
                               tdq_trysteal(tdq);
      #endif
              }
      
      #if (KTR_COMPILE & KTR_SCHED) != 0
              if (TD_IS_IDLETHREAD(td))
                      KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
                          "prio:%d", td->td_priority);
              else
                      KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
                          "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
                          "lockname:\"%s\"", td->td_lockname);
      #endif
      
              /*
               * We enter here with the thread blocked and assigned to the
               * appropriate cpu run-queue or sleep-queue and with the current
               * thread-queue locked.
               */
              TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
              newtd = choosethread();
               sched_pctcpu_update(td_get_sched(newtd), 0);
              TDQ_UNLOCK(tdq);
      
              /*
               * Call the MD code to switch contexts if necessary.
               */
              if (td != newtd) {
      #ifdef        HWPMC_HOOKS
                       if (PMC_PROC_IS_USING_PMCS(td->td_proc))
                              PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
      #endif
                       SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);
      
      #ifdef KDTRACE_HOOKS
                      /*
                       * If DTrace has set the active vtime enum to anything
                       * other than INACTIVE (0), then it should have set the
                       * function to call.
                       */
                       if (dtrace_vtime_active)
                              (*dtrace_vtime_switch_func)(newtd);
      #endif
                      td->td_oncpu = NOCPU;
                      cpu_switch(td, newtd, mtx);
                      cpuid = td->td_oncpu = PCPU_GET(cpuid);
      
                       SDT_PROBE0(sched, , , on__cpu);
       #ifdef        HWPMC_HOOKS
                       if (PMC_PROC_IS_USING_PMCS(td->td_proc))
                              PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
      #endif
              } else {
                      thread_unblock_switch(td, mtx);
                       SDT_PROBE0(sched, , , remain__cpu);
              }
              KASSERT(curthread->td_md.md_spinlock_count == 1,
                  ("invalid count %d", curthread->td_md.md_spinlock_count));
      
              KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
                  "prio:%d", td->td_priority);
       }
      
      /*
       * Adjust thread priorities as a result of a nice request.
       */
      void
      sched_nice(struct proc *p, int nice)
      {
              struct thread *td;
      
              PROC_LOCK_ASSERT(p, MA_OWNED);
      
              p->p_nice = nice;
              FOREACH_THREAD_IN_PROC(p, td) {
                      thread_lock(td);
                      sched_priority(td);
                      sched_prio(td, td->td_base_user_pri);
                      thread_unlock(td);
              }
      }
      
      /*
       * Record the sleep time for the interactivity scorer.
       */
      void
      sched_sleep(struct thread *td, int prio)
       {
      
              THREAD_LOCK_ASSERT(td, MA_OWNED);
      
              td->td_slptick = ticks;
               if (TD_IS_SUSPENDED(td) || prio >= PSOCK)
                       td->td_flags |= TDF_CANSWAP;
              if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
                      return;
              if (static_boost == 1 && prio)
                      sched_prio(td, prio);
               else if (static_boost && td->td_priority > static_boost)
                      sched_prio(td, static_boost);
      }
      
      /*
       * Schedule a thread to resume execution and record how long it voluntarily
       * slept.  We also update the pctcpu, interactivity, and priority.
       *
       * Requires the thread lock on entry, drops on exit.
       */
      void
      sched_wakeup(struct thread *td, int srqflags)
       {
              struct td_sched *ts;
              int slptick;
      
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              ts = td_get_sched(td);
              td->td_flags &= ~TDF_CANSWAP;
      
              /*
               * If we slept for more than a tick update our interactivity and
               * priority.
               */
              slptick = td->td_slptick;
              td->td_slptick = 0;
               if (slptick && slptick != ticks) {
                      ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT;
                       sched_interact_update(td);
                       sched_pctcpu_update(ts, 0);
              }
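               /*
                * Note that the sleep time above is scaled by SCHED_TICK_SHIFT
                * so it is directly comparable with ts_runtime, which
                * sched_clock() advances by the similarly scaled tickincr; the
                * interactivity score can then divide one by the other without
                * further conversion.
                */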
              /*
               * Reset the slice value since we slept and advanced the round-robin.
               */
              ts->ts_slice = 0;
              sched_add(td, SRQ_BORING | srqflags);
      }
      
      /*
       * Penalize the parent for creating a new child and initialize the child's
       * priority.
       */
      void
      sched_fork(struct thread *td, struct thread *child)
      {
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              sched_pctcpu_update(td_get_sched(td), 1);
              sched_fork_thread(td, child);
              /*
               * Penalize the parent and child for forking.
               */
              sched_interact_fork(child);
              sched_priority(child);
              td_get_sched(td)->ts_runtime += tickincr;
              sched_interact_update(td);
              sched_priority(td);
      }
      
      /*
       * Fork a new thread, may be within the same process.
       */
      void
      sched_fork_thread(struct thread *td, struct thread *child)
      {
              struct td_sched *ts;
              struct td_sched *ts2;
              struct tdq *tdq;
      
              tdq = TDQ_SELF();
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              /*
               * Initialize child.
               */
              ts = td_get_sched(td);
              ts2 = td_get_sched(child);
              child->td_oncpu = NOCPU;
              child->td_lastcpu = NOCPU;
              child->td_lock = TDQ_LOCKPTR(tdq);
              child->td_cpuset = cpuset_ref(td->td_cpuset);
              child->td_domain.dr_policy = td->td_cpuset->cs_domain;
              ts2->ts_cpu = ts->ts_cpu;
              ts2->ts_flags = 0;
              /*
                * Grab our parent's cpu estimation information.
               */
              ts2->ts_ticks = ts->ts_ticks;
              ts2->ts_ltick = ts->ts_ltick;
              ts2->ts_ftick = ts->ts_ftick;
              /*
               * Do not inherit any borrowed priority from the parent.
               */
              child->td_priority = child->td_base_pri;
              /*
               * And update interactivity score.
               */
              ts2->ts_slptime = ts->ts_slptime;
              ts2->ts_runtime = ts->ts_runtime;
              /* Attempt to quickly learn interactivity. */
              ts2->ts_slice = tdq_slice(tdq) - sched_slice_min;
      #ifdef KTR
              bzero(ts2->ts_name, sizeof(ts2->ts_name));
      #endif
      }
      
      /*
       * Adjust the priority class of a thread.
       */
      void
      sched_class(struct thread *td, int class)
      {
      
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              if (td->td_pri_class == class)
                      return;
              td->td_pri_class = class;
      }
      
      /*
       * Return some of the child's priority and interactivity to the parent.
       */
      void
      sched_exit(struct proc *p, struct thread *child)
      {
              struct thread *td;
      
              KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit",
                  "prio:%d", child->td_priority);
              PROC_LOCK_ASSERT(p, MA_OWNED);
              td = FIRST_THREAD_IN_PROC(p);
              sched_exit_thread(td, child);
      }
      
      /*
       * Penalize another thread for the time spent on this one.  This helps to
       * worsen the priority and interactivity of processes which schedule batch
       * jobs such as make.  This has little effect on the make process itself but
       * causes new processes spawned by it to receive worse scores immediately.
       */
      void
      sched_exit_thread(struct thread *td, struct thread *child)
      {
      
              KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit",
                  "prio:%d", child->td_priority);
              /*
               * Give the child's runtime to the parent without returning the
               * sleep time as a penalty to the parent.  This causes shells that
               * launch expensive things to mark their children as expensive.
               */
              thread_lock(td);
              td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime;
              sched_interact_update(td);
              sched_priority(td);
              thread_unlock(td);
      }
      
      void
      sched_preempt(struct thread *td)
      {
              struct tdq *tdq;
              int flags;
      
              SDT_PROBE2(sched, , , surrender, td, td->td_proc);
      
              thread_lock(td);
              tdq = TDQ_SELF();
              TDQ_LOCK_ASSERT(tdq, MA_OWNED);
              if (td->td_priority > tdq->tdq_lowpri) {
                      if (td->td_critnest == 1) {
                              flags = SW_INVOL | SW_PREEMPT;
                              flags |= TD_IS_IDLETHREAD(td) ? SWT_REMOTEWAKEIDLE :
                                  SWT_REMOTEPREEMPT;
                              mi_switch(flags);
                              /* Switch dropped thread lock. */
                              return;
                      }
                      td->td_owepreempt = 1;
              } else {
                      tdq->tdq_owepreempt = 0;
              }
              thread_unlock(td);
      }
      
      /*
       * Fix priorities on return to user-space.  Priorities may be elevated due
       * to static priorities in msleep() or similar.
       */
      void
      sched_userret_slowpath(struct thread *td)
 6645 {
      
              thread_lock(td);
              td->td_priority = td->td_user_pri;
              td->td_base_pri = td->td_user_pri;
 6638         tdq_setlowpri(TDQ_SELF(), td);
              thread_unlock(td);
      }
      
      /*
       * Handle a stathz tick.  This is really only relevant for timeshare
       * threads.
       */
      void
      sched_clock(struct thread *td, int cnt)
      {
              struct tdq *tdq;
              struct td_sched *ts;
      
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              tdq = TDQ_SELF();
      #ifdef SMP
              /*
               * We run the long term load balancer infrequently on the first cpu.
               */
              if (balance_tdq == tdq && smp_started != 0 && rebalance != 0 &&
                  balance_ticks != 0) {
                      balance_ticks -= cnt;
                      if (balance_ticks <= 0)
                              sched_balance();
              }
      #endif
              /*
                * Save the old switch count so we have a record of the last tick's
                * activity.  Initialize the new switch count based on our load.
                * If there is some activity, seed it to reflect that.
               */
              tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
              tdq->tdq_switchcnt = tdq->tdq_load;
              /*
               * Advance the insert index once for each tick to ensure that all
               * threads get a chance to run.
               */
              if (tdq->tdq_idx == tdq->tdq_ridx) {
                      tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
                      if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
                              tdq->tdq_ridx = tdq->tdq_idx;
              }
              ts = td_get_sched(td);
              sched_pctcpu_update(ts, 1);
              if ((td->td_pri_class & PRI_FIFO_BIT) || TD_IS_IDLETHREAD(td))
                      return;
      
              if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) {
                      /*
                       * We used a tick; charge it to the thread so
                       * that we can compute our interactivity.
                       */
                      td_get_sched(td)->ts_runtime += tickincr * cnt;
                      sched_interact_update(td);
                      sched_priority(td);
              }
      
              /*
               * Force a context switch if the current thread has used up a full
               * time slice (default is 100ms).
               */
              ts->ts_slice += cnt;
              if (ts->ts_slice >= tdq_slice(tdq)) {
                      ts->ts_slice = 0;
                      td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
              }
      }
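       /*
        * A note on the slice check above: tdq_slice() (defined earlier in
        * this file) returns either sched_slice or the smaller sched_slice_min
        * depending on queue load, so a CPU-bound timeshare thread is flagged
        * for a forced switch roughly every 100ms on a lightly loaded queue
        * and sooner on a busy one.
        */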
      
      u_int
      sched_estcpu(struct thread *td __unused)
      {
      
              return (0);
      }
      
      /*
       * Return whether the current CPU has runnable tasks.  Used for in-kernel
       * cooperative idle threads.
       */
      int
      sched_runnable(void)
      {
              struct tdq *tdq;
              int load;
      
              load = 1;
      
              tdq = TDQ_SELF();
              if ((curthread->td_flags & TDF_IDLETD) != 0) {
                      if (tdq->tdq_load > 0)
                              goto out;
              } else
                      if (tdq->tdq_load - 1 > 0)
                              goto out;
              load = 0;
      out:
              return (load);
      }
      
      /*
       * Choose the highest priority thread to run.  The thread is removed from
        * the run-queue while running; however, the load remains.  For SMP we set
       * the tdq in the global idle bitmask if it idles here.
       */
      struct thread *
      sched_choose(void)
       {
              struct thread *td;
              struct tdq *tdq;
      
              tdq = TDQ_SELF();
              TDQ_LOCK_ASSERT(tdq, MA_OWNED);
              td = tdq_choose(tdq);
              if (td) {
                       tdq_runq_rem(tdq, td);
                      tdq->tdq_lowpri = td->td_priority;
                      return (td);
              }
               tdq->tdq_lowpri = PRI_MAX_IDLE;
              return (PCPU_GET(idlethread));
      }
      
      /*
        * Set owepreempt if necessary.  Preemption never happens directly in ULE;
        * we always request it once we exit a critical section.
       */
      static inline void
      sched_setpreempt(struct thread *td)
      {
              struct thread *ctd;
              int cpri;
              int pri;
      
              THREAD_LOCK_ASSERT(curthread, MA_OWNED);
      
              ctd = curthread;
              pri = td->td_priority;
              cpri = ctd->td_priority;
               if (pri < cpri)
                       ctd->td_flags |= TDF_NEEDRESCHED;
               if (KERNEL_PANICKED() || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
                      return;
              if (!sched_shouldpreempt(pri, cpri, 0))
                      return;
              ctd->td_owepreempt = 1;
      }
      
      /*
       * Add a thread to a thread queue.  Select the appropriate runq and add the
       * thread to it.  This is the internal function called when the tdq is
       * predetermined.
       */
      void
      tdq_add(struct tdq *tdq, struct thread *td, int flags)
       {
      
              TDQ_LOCK_ASSERT(tdq, MA_OWNED);
               THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
              KASSERT((td->td_inhibitors == 0),
                  ("sched_add: trying to run inhibited thread"));
               KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
                  ("sched_add: bad thread state"));
              KASSERT(td->td_flags & TDF_INMEM,
                  ("sched_add: thread swapped out"));
      
               if (td->td_priority < tdq->tdq_lowpri)
                       tdq->tdq_lowpri = td->td_priority;
              tdq_runq_add(tdq, td, flags);
              tdq_load_add(tdq, td);
      }
      
      /*
       * Select the target thread queue and add a thread to it.  Request
       * preemption or IPI a remote processor if required.
       *
       * Requires the thread lock on entry, drops on exit.
       */
      void
      sched_add(struct thread *td, int flags)
       {
              struct tdq *tdq;
      #ifdef SMP
              int cpu;
      #endif
      
              KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
                  "prio:%d", td->td_priority, KTR_ATTR_LINKED,
                  sched_tdname(curthread));
              KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
                  KTR_ATTR_LINKED, sched_tdname(td));
               SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
                  flags & SRQ_PREEMPTED);
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              /*
               * Recalculate the priority before we select the target cpu or
               * run-queue.
               */
               if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
                       sched_priority(td);
      #ifdef SMP
              /*
               * Pick the destination cpu and if it isn't ours transfer to the
               * target cpu.
               */
              cpu = sched_pickcpu(td, flags);
              tdq = sched_setcpu(td, cpu, flags);
              tdq_add(tdq, td, flags);
              if (cpu != PCPU_GET(cpuid))
                       tdq_notify(tdq, td);
              else if (!(flags & SRQ_YIELDING))
                      sched_setpreempt(td);
      #else
              tdq = TDQ_SELF();
              /*
               * Now that the thread is moving to the run-queue, set the lock
               * to the scheduler's lock.
               */
              if (td->td_lock != TDQ_LOCKPTR(tdq)) {
                      TDQ_LOCK(tdq);
                      if ((flags & SRQ_HOLD) != 0)
                              td->td_lock = TDQ_LOCKPTR(tdq);
                      else
                              thread_lock_set(td, TDQ_LOCKPTR(tdq));
              }
              tdq_add(tdq, td, flags);
              if (!(flags & SRQ_YIELDING))
                      sched_setpreempt(td);
      #endif
               if (!(flags & SRQ_HOLDTD))
                       thread_unlock(td);
      }
      
      /*
       * Remove a thread from a run-queue without running it.  This is used
       * when we're stealing a thread from a remote queue.  Otherwise all threads
       * exit by calling sched_exit_thread() and sched_throw() themselves.
       */
      void
      sched_rem(struct thread *td)
       {
              struct tdq *tdq;
      
              KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
                  "prio:%d", td->td_priority);
               SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
              tdq = TDQ_CPU(td_get_sched(td)->ts_cpu);
              TDQ_LOCK_ASSERT(tdq, MA_OWNED);
              MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
              KASSERT(TD_ON_RUNQ(td),
                  ("sched_rem: thread not on run queue"));
              tdq_runq_rem(tdq, td);
              tdq_load_rem(tdq, td);
              TD_SET_CAN_RUN(td);
               if (td->td_priority == tdq->tdq_lowpri)
                      tdq_setlowpri(tdq, NULL);
      }
      
      /*
       * Fetch cpu utilization information.  Updates on demand.
       */
      fixpt_t
      sched_pctcpu(struct thread *td)
      {
              fixpt_t pctcpu;
              struct td_sched *ts;
      
              pctcpu = 0;
              ts = td_get_sched(td);
      
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              sched_pctcpu_update(ts, TD_IS_RUNNING(td));
              if (ts->ts_ticks) {
                      int rtick;
      
                      /* How many rtick per second ? */
                      rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
                      pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
              }
      
              return (pctcpu);
      }
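       /*
        * Reading the math above: a thread that was on CPU for about half of
        * its sampling window yields rtick of roughly hz / 2 and therefore a
        * pctcpu of roughly FSCALE / 2, i.e. 50% in the fixed-point scale
        * where FSCALE represents 100%.
        */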
      
      /*
       * Enforce affinity settings for a thread.  Called after adjustments to
       * cpumask.
       */
      void
      sched_affinity(struct thread *td)
      {
      #ifdef SMP
              struct td_sched *ts;
      
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              ts = td_get_sched(td);
              if (THREAD_CAN_SCHED(td, ts->ts_cpu))
                      return;
              if (TD_ON_RUNQ(td)) {
                      sched_rem(td);
                      sched_add(td, SRQ_BORING | SRQ_HOLDTD);
                      return;
              }
              if (!TD_IS_RUNNING(td))
                      return;
              /*
               * Force a switch before returning to userspace.  If the
               * target thread is not running locally send an ipi to force
               * the issue.
               */
              td->td_flags |= TDF_NEEDRESCHED;
              if (td != curthread)
                      ipi_cpu(ts->ts_cpu, IPI_PREEMPT);
      #endif
      }
      
      /*
       * Bind a thread to a target cpu.
       */
      void
      sched_bind(struct thread *td, int cpu)
      {
              struct td_sched *ts;
      
              THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
              KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
              ts = td_get_sched(td);
              if (ts->ts_flags & TSF_BOUND)
                      sched_unbind(td);
              KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td));
              ts->ts_flags |= TSF_BOUND;
              sched_pin();
              if (PCPU_GET(cpuid) == cpu)
                      return;
              ts->ts_cpu = cpu;
              /* When we return from mi_switch we'll be on the correct cpu. */
              mi_switch(SW_VOL);
              thread_lock(td);
      }
      
      /*
       * Release a bound thread.
       */
      void
      sched_unbind(struct thread *td)
      {
              struct td_sched *ts;
      
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
              ts = td_get_sched(td);
              if ((ts->ts_flags & TSF_BOUND) == 0)
                      return;
              ts->ts_flags &= ~TSF_BOUND;
              sched_unpin();
      }
      
      int
      sched_is_bound(struct thread *td)
      {
              THREAD_LOCK_ASSERT(td, MA_OWNED);
              return (td_get_sched(td)->ts_flags & TSF_BOUND);
      }
      
      /*
       * Basic yield call.
       */
      void
      sched_relinquish(struct thread *td)
      {
              thread_lock(td);
              mi_switch(SW_VOL | SWT_RELINQUISH);
      }
      
      /*
       * Return the total system load.
       */
      int
      sched_load(void)
      {
      #ifdef SMP
              int total;
              int i;
      
              total = 0;
              CPU_FOREACH(i)
                      total += TDQ_CPU(i)->tdq_sysload;
              return (total);
      #else
              return (TDQ_SELF()->tdq_sysload);
      #endif
      }
      
      int
      sched_sizeof_proc(void)
      {
              return (sizeof(struct proc));
      }
      
      int
      sched_sizeof_thread(void)
      {
              return (sizeof(struct thread) + sizeof(struct td_sched));
      }
      
      #ifdef SMP
      #define        TDQ_IDLESPIN(tdq)                                                \
          ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0)
      #else
      #define        TDQ_IDLESPIN(tdq)        1
      #endif
      
      /*
       * The actual idle process.
       */
      void
      sched_idletd(void *dummy)
      {
              struct thread *td;
              struct tdq *tdq;
              int oldswitchcnt, switchcnt;
              int i;
      
              mtx_assert(&Giant, MA_NOTOWNED);
              td = curthread;
              tdq = TDQ_SELF();
              THREAD_NO_SLEEPING();
              oldswitchcnt = -1;
              for (;;) {
                      if (tdq->tdq_load) {
                              thread_lock(td);
                              mi_switch(SW_VOL | SWT_IDLE);
                      }
                      switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
      #ifdef SMP
                      if (always_steal || switchcnt != oldswitchcnt) {
                              oldswitchcnt = switchcnt;
                              if (tdq_idled(tdq) == 0)
                                      continue;
                      }
                      switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
      #else
                      oldswitchcnt = switchcnt;
      #endif
                      /*
                       * If we're switching very frequently, spin while checking
                       * for load rather than entering a low power state that 
                       * may require an IPI.  However, don't do any busy
                       * loops while on SMT machines as this simply steals
                       * cycles from cores doing useful work.
                       */
                      if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
                              for (i = 0; i < sched_idlespins; i++) {
                                      if (tdq->tdq_load)
                                              break;
                                      cpu_spinwait();
                              }
                      }
      
                       /* If there was a context switch during the spin, restart it. */
                      switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
                      if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt)
                              continue;
      
                      /* Run main MD idle handler. */
                      tdq->tdq_cpu_idle = 1;
                      /*
                        * Make sure that the tdq_cpu_idle update is globally visible
                        * before cpu_idle() reads tdq_load.  The order is important
                        * to avoid a race with tdq_notify().
                       */
                      atomic_thread_fence_seq_cst();
                /*
                 * Checking the load again after the fence picks up newly
                 * assigned threads often enough to make it worthwhile,
                 * and avoids an unnecessary call to cpu_idle().
                 */
                      if (tdq->tdq_load != 0) {
                              tdq->tdq_cpu_idle = 0;
                              continue;
                      }
                      cpu_idle(switchcnt * 4 > sched_idlespinthresh);
                      tdq->tdq_cpu_idle = 0;
      
                      /*
                 * Account for thread-less hardware interrupts and
                 * other wakeup reasons the same as context switches.
                       */
                      switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
                      if (switchcnt != oldswitchcnt)
                              continue;
                      tdq->tdq_switchcnt++;
                      oldswitchcnt++;
              }
      }
      
      /*
 * A CPU is entering the scheduler for the first time or a thread is exiting.
       */
      void
      sched_throw(struct thread *td)
      {
              struct thread *newtd;
              struct tdq *tdq;
      
              if (__predict_false(td == NULL)) {
      #ifdef SMP
                      PCPU_SET(sched, DPCPU_PTR(tdq));
      #endif
                      /* Correct spinlock nesting and acquire the correct lock. */
                      tdq = TDQ_SELF();
                      TDQ_LOCK(tdq);
                      spinlock_exit();
                      PCPU_SET(switchtime, cpu_ticks());
                      PCPU_SET(switchticks, ticks);
                      PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(tdq);
              } else {
                      tdq = TDQ_SELF();
                      THREAD_LOCK_ASSERT(td, MA_OWNED);
                      THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(tdq));
                      tdq_load_rem(tdq, td);
                      td->td_lastcpu = td->td_oncpu;
                      td->td_oncpu = NOCPU;
                      thread_lock_block(td);
              }
              newtd = choosethread();
              spinlock_enter();
              TDQ_UNLOCK(tdq);
              KASSERT(curthread->td_md.md_spinlock_count == 1,
                  ("invalid count %d", curthread->td_md.md_spinlock_count));
              /* doesn't return */
              if (__predict_false(td == NULL))
                      cpu_throw(td, newtd);                /* doesn't return */
              else
                      cpu_switch(td, newtd, TDQ_LOCKPTR(tdq));
      }
      
      /*
       * This is called from fork_exit().  Just acquire the correct locks and
       * let fork do the rest of the work.
       */
      void
      sched_fork_exit(struct thread *td)
      {
              struct tdq *tdq;
              int cpuid;
      
              /*
               * Finish setting up thread glue so that it begins execution in a
               * non-nested critical section with the scheduler lock held.
               */
              KASSERT(curthread->td_md.md_spinlock_count == 1,
                  ("invalid count %d", curthread->td_md.md_spinlock_count));
              cpuid = PCPU_GET(cpuid);
              tdq = TDQ_SELF();
              TDQ_LOCK(tdq);
              spinlock_exit();
              MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
              td->td_oncpu = cpuid;
              KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
                  "prio:%d", td->td_priority);
              SDT_PROBE0(sched, , , on__cpu);
      }
      
      /*
 * Create on first use to catch odd startup conditions.
       */
      char *
      sched_tdname(struct thread *td)
      {
      #ifdef KTR
              struct td_sched *ts;
      
              ts = td_get_sched(td);
              if (ts->ts_name[0] == '\0')
                      snprintf(ts->ts_name, sizeof(ts->ts_name),
                          "%s tid %d", td->td_name, td->td_tid);
              return (ts->ts_name);
      #else
              return (td->td_name);
      #endif
      }
      
      #ifdef KTR
      void
      sched_clear_tdname(struct thread *td)
      {
              struct td_sched *ts;
      
              ts = td_get_sched(td);
              ts->ts_name[0] = '\0';
      }
      #endif
      
      #ifdef SMP
      
      /*
 * Build the CPU topology dump string.  It is called recursively to
 * collect the topology tree.
       */
      static int
      sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
          int indent)
      {
              char cpusetbuf[CPUSETBUFSIZ];
              int i, first;
      
              sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
                  "", 1 + indent / 2, cg->cg_level);
              sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
                  cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask));
              first = TRUE;
              for (i = 0; i < MAXCPU; i++) {
                      if (CPU_ISSET(i, &cg->cg_mask)) {
                              if (!first)
                                      sbuf_printf(sb, ", ");
                              else
                                      first = FALSE;
                              sbuf_printf(sb, "%d", i);
                      }
              }
              sbuf_printf(sb, "</cpu>\n");
      
              if (cg->cg_flags != 0) {
                      sbuf_printf(sb, "%*s <flags>", indent, "");
                      if ((cg->cg_flags & CG_FLAG_HTT) != 0)
                              sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>");
                      if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
                              sbuf_printf(sb, "<flag name=\"THREAD\">THREAD group</flag>");
                      if ((cg->cg_flags & CG_FLAG_SMT) != 0)
                              sbuf_printf(sb, "<flag name=\"SMT\">SMT group</flag>");
                      sbuf_printf(sb, "</flags>\n");
              }
      
              if (cg->cg_children > 0) {
                      sbuf_printf(sb, "%*s <children>\n", indent, "");
                      for (i = 0; i < cg->cg_children; i++)
                              sysctl_kern_sched_topology_spec_internal(sb, 
                                  &cg->cg_child[i], indent+2);
                      sbuf_printf(sb, "%*s </children>\n", indent, "");
              }
              sbuf_printf(sb, "%*s</group>\n", indent, "");
              return (0);
      }
      
      /*
 * Sysctl handler for retrieving the topology dump.  It is a wrapper for
 * the recursive sysctl_kern_sched_topology_spec_internal().
       */
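/*
 * For illustration only, the generated document has roughly the shape
 * below (the values are made up; the real output depends on the machine
 * and can be read with "sysctl kern.sched.topology_spec"):
 *
 *      <groups>
 *       <group level="1" cache-level="0">
 *        <cpu count="4" mask="f,0,0,0">0, 1, 2, 3</cpu>
 *        <children>
 *         <group level="2" cache-level="2">
 *          <cpu count="2" mask="3,0,0,0">0, 1</cpu>
 *          <flags><flag name="SMT">SMT group</flag></flags>
 *         </group>
 *        </children>
 *       </group>
 *      </groups>
 */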
      static int
      sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)
      {
              struct sbuf *topo;
              int err;
      
              KASSERT(cpu_top != NULL, ("cpu_top isn't initialized"));
      
              topo = sbuf_new_for_sysctl(NULL, NULL, 512, req);
              if (topo == NULL)
                      return (ENOMEM);
      
              sbuf_printf(topo, "<groups>\n");
              err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
              sbuf_printf(topo, "</groups>\n");
      
              if (err == 0) {
                      err = sbuf_finish(topo);
              }
              sbuf_delete(topo);
              return (err);
      }
      
      #endif
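
/*
 * kern.sched.quantum reports and sets the timeshare quantum in
 * microseconds by converting to and from sched_slice, which is kept in
 * stathz ticks.  As a rough worked example, assuming realstathz is 127:
 * the tick period is 1000000 / 127 = 7874 us, so writing 20000 stores
 * sched_slice = (20000 + 3937) / 7874 = 3 ticks, and the quantum reads
 * back as 3 * 7874 = 23622 us because of the rounding.
 */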
      
      static int
      sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
      {
              int error, new_val, period;
      
              period = 1000000 / realstathz;
              new_val = period * sched_slice;
              error = sysctl_handle_int(oidp, &new_val, 0, req);
              if (error != 0 || req->newptr == NULL)
                      return (error);
              if (new_val <= 0)
                      return (EINVAL);
              sched_slice = imax(1, (new_val + period / 2) / period);
              sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
              hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
                  realstathz);
              return (0);
      }
      
      SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Scheduler");
      SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
          "Scheduler name");
      SYSCTL_PROC(_kern_sched, OID_AUTO, quantum,
          CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
          sysctl_kern_quantum, "I",
          "Quantum for timeshare threads in microseconds");
      SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
          "Quantum for timeshare threads in stathz ticks");
      SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
          "Interactivity score threshold");
      SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW,
          &preempt_thresh, 0,
          "Maximal (lowest) priority for preemption");
      SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0,
          "Assign static kernel priorities to sleeping threads");
      SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0,
          "Number of times idle thread will spin waiting for new work");
      SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW,
          &sched_idlespinthresh, 0,
          "Threshold before we will permit idle thread spinning");
      #ifdef SMP
      SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
          "Number of hz ticks to keep thread affinity for");
      SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
          "Enables the long-term load balancer");
      SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
          &balance_interval, 0,
          "Average period in stathz ticks to run the long-term balancer");
      SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
          "Attempts to steal work from other cores before idling");
      SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
          "Minimum load on remote CPU before we'll steal");
      SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RW, &trysteal_limit,
          0, "Topological distance limit for stealing threads in sched_switch()");
      SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RW, &always_steal, 0,
          "Always run the stealer from the idle thread");
      SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
          CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A",
          "XML dump of detected CPU topology");
      #endif
      
      /* ps compat.  All cpu percentages from ULE are weighted. */
      static int ccpu = 0;
      SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0,
          "Decay factor used for updating %CPU in 4BSD scheduler");
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2007 Attilio Rao <attilio@freebsd.org>
       * Copyright (c) 2001 Jason Evans <jasone@freebsd.org>
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice(s), this list of conditions and the following disclaimer as
       *    the first lines of this file unmodified other than the possible
       *    addition of one or more copyright notices.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice(s), this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
       * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
       * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
       * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
       * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
       * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
       * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
       * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
       * DAMAGE.
       */
      
      /*
       * Shared/exclusive locks.  This implementation attempts to ensure
       * deterministic lock granting behavior, so that slocks and xlocks are
       * interleaved.
       *
       * Priority propagation will not generally raise the priority of lock holders,
       * so should not be relied upon in combination with sx locks.
       */
      
      #include "opt_ddb.h"
      #include "opt_hwpmc_hooks.h"
      #include "opt_no_adaptive_sx.h"
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/kdb.h>
      #include <sys/kernel.h>
      #include <sys/ktr.h>
      #include <sys/lock.h>
      #include <sys/mutex.h>
      #include <sys/proc.h>
      #include <sys/sched.h>
      #include <sys/sleepqueue.h>
      #include <sys/sx.h>
      #include <sys/smp.h>
      #include <sys/sysctl.h>
      
      #if defined(SMP) && !defined(NO_ADAPTIVE_SX)
      #include <machine/cpu.h>
      #endif
      
      #ifdef DDB
      #include <ddb/ddb.h>
      #endif
      
      #if defined(SMP) && !defined(NO_ADAPTIVE_SX)
      #define        ADAPTIVE_SX
      #endif
      
      #ifdef HWPMC_HOOKS
      #include <sys/pmckern.h>
      PMC_SOFT_DECLARE( , , lock, failed);
      #endif
      
      /* Handy macros for sleep queues. */
      #define        SQ_EXCLUSIVE_QUEUE        0
      #define        SQ_SHARED_QUEUE                1
      
      /*
       * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file.  We
       * drop Giant anytime we have to sleep or if we adaptively spin.
       */
      #define        GIANT_DECLARE                                                        \
              int _giantcnt = 0;                                                \
              WITNESS_SAVE_DECL(Giant)                                        \
      
      #define        GIANT_SAVE(work) do {                                                \
              if (__predict_false(mtx_owned(&Giant))) {                        \
                      work++;                                                        \
                      WITNESS_SAVE(&Giant.lock_object, Giant);                \
                      while (mtx_owned(&Giant)) {                                \
                              _giantcnt++;                                        \
                              mtx_unlock(&Giant);                                \
                      }                                                        \
              }                                                                \
      } while (0)
      
      #define GIANT_RESTORE() do {                                                \
              if (_giantcnt > 0) {                                                \
                      mtx_assert(&Giant, MA_NOTOWNED);                        \
                      while (_giantcnt--)                                        \
                              mtx_lock(&Giant);                                \
                      WITNESS_RESTORE(&Giant.lock_object, Giant);                \
              }                                                                \
      } while (0)
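
/*
 * A rough sketch of how the hard-case paths below use these macros
 * (illustrative only):
 *
 *      GIANT_DECLARE;
 *      ...
 *      GIANT_SAVE(extra_work);         -- fully release Giant before
 *                                         spinning or sleeping
 *      ...
 *      GIANT_RESTORE();                -- reacquire Giant to the saved
 *                                         recursion depth on the way out
 */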
      
      /*
       * Returns true if an exclusive lock is recursed.  It assumes
       * curthread currently has an exclusive lock.
       */
      #define        sx_recursed(sx)                ((sx)->sx_recurse != 0)
      
      static void        assert_sx(const struct lock_object *lock, int what);
      #ifdef DDB
      static void        db_show_sx(const struct lock_object *lock);
      #endif
      static void        lock_sx(struct lock_object *lock, uintptr_t how);
      #ifdef KDTRACE_HOOKS
      static int        owner_sx(const struct lock_object *lock, struct thread **owner);
      #endif
      static uintptr_t unlock_sx(struct lock_object *lock);
      
      struct lock_class lock_class_sx = {
              .lc_name = "sx",
              .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE,
              .lc_assert = assert_sx,
      #ifdef DDB
              .lc_ddb_show = db_show_sx,
      #endif
              .lc_lock = lock_sx,
              .lc_unlock = unlock_sx,
      #ifdef KDTRACE_HOOKS
              .lc_owner = owner_sx,
      #endif
      };
      
      #ifndef INVARIANTS
      #define        _sx_assert(sx, what, file, line)
      #endif
      
      #ifdef ADAPTIVE_SX
      #ifdef SX_CUSTOM_BACKOFF
      static u_short __read_frequently asx_retries;
      static u_short __read_frequently asx_loops;
      static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
          "sxlock debugging");
      SYSCTL_U16(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, "");
      SYSCTL_U16(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, "");
      
      static struct lock_delay_config __read_frequently sx_delay;
      
      SYSCTL_U16(_debug_sx, OID_AUTO, delay_base, CTLFLAG_RW, &sx_delay.base,
          0, "");
      SYSCTL_U16(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max,
          0, "");
      
      static void
      sx_lock_delay_init(void *arg __unused)
      {
      
              lock_delay_default_init(&sx_delay);
              asx_retries = 10;
              asx_loops = max(10000, sx_delay.max);
      }
      LOCK_DELAY_SYSINIT(sx_lock_delay_init);
      #else
      #define sx_delay        locks_delay
      #define asx_retries        locks_delay_retries
      #define asx_loops        locks_delay_loops
      #endif
      #endif
      
      void
      assert_sx(const struct lock_object *lock, int what)
      {
      
              sx_assert((const struct sx *)lock, what);
      }
      
      void
      lock_sx(struct lock_object *lock, uintptr_t how)
{
              struct sx *sx;
      
              sx = (struct sx *)lock;
              if (how)
                      sx_slock(sx);
              else
                sx_xlock(sx);
      }
      
      uintptr_t
      unlock_sx(struct lock_object *lock)
{
              struct sx *sx;
      
              sx = (struct sx *)lock;
              sx_assert(sx, SA_LOCKED | SA_NOTRECURSED);
              if (sx_xlocked(sx)) {
                sx_xunlock(sx);
                      return (0);
              } else {
                      sx_sunlock(sx);
                      return (1);
              }
      }
      
      #ifdef KDTRACE_HOOKS
      int
      owner_sx(const struct lock_object *lock, struct thread **owner)
      {
              const struct sx *sx;
              uintptr_t x;
      
              sx = (const struct sx *)lock;
              x = sx->sx_lock;
              *owner = NULL;
              return ((x & SX_LOCK_SHARED) != 0 ? (SX_SHARERS(x) != 0) :
                  ((*owner = (struct thread *)SX_OWNER(x)) != NULL));
      }
      #endif
      
      void
      sx_sysinit(void *arg)
      {
              struct sx_args *sargs = arg;
      
              sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags);
      }
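
/*
 * sx_sysinit() backs the SX_SYSINIT() convenience macro from sys/sx.h,
 * which registers a SYSINIT to initialize a lock during boot.
 * Illustrative use only (the identifiers below are not part of this
 * file):
 *
 *      static struct sx example_lock;
 *      SX_SYSINIT(example_lock_init, &example_lock, "example lock");
 */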
      
      void
      sx_init_flags(struct sx *sx, const char *description, int opts)
{
              int flags;
      
              MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK |
                  SX_NOPROFILE | SX_NEW)) == 0);
              ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock,
                  ("%s: sx_lock not aligned for %s: %p", __func__, description,
                  &sx->sx_lock));
      
              flags = LO_SLEEPABLE | LO_UPGRADABLE;
              if (opts & SX_DUPOK)
                      flags |= LO_DUPOK;
              if (opts & SX_NOPROFILE)
                      flags |= LO_NOPROFILE;
              if (!(opts & SX_NOWITNESS))
                      flags |= LO_WITNESS;
              if (opts & SX_RECURSE)
                      flags |= LO_RECURSABLE;
              if (opts & SX_QUIET)
                      flags |= LO_QUIET;
              if (opts & SX_NEW)
                      flags |= LO_NEW;
      
              lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags);
              sx->sx_lock = SX_LOCK_UNLOCKED;
              sx->sx_recurse = 0;
      }
      
      void
      sx_destroy(struct sx *sx)
{
      
              KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held"));
        KASSERT(sx->sx_recurse == 0, ("sx lock still recursed"));
              sx->sx_lock = SX_LOCK_DESTROYED;
              lock_destroy(&sx->lock_object);
      }
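
/*
 * Illustrative sx(9) lifecycle, for reference only (the lock and the
 * identifiers below are made up and not part of this file):
 *
 *      static struct sx data_lock;
 *
 *      sx_init(&data_lock, "data lock");
 *      sx_xlock(&data_lock);           -- exclusive, for writers
 *      sx_xunlock(&data_lock);
 *      sx_slock(&data_lock);           -- shared, for readers
 *      sx_sunlock(&data_lock);
 *      sx_destroy(&data_lock);
 */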
      
      int
      sx_try_slock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF)
{
              uintptr_t x;
      
              if (SCHEDULER_STOPPED())
                      return (1);
      
        KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
                  ("sx_try_slock() by idle thread %p on sx %s @ %s:%d",
                  curthread, sx->lock_object.lo_name, file, line));
      
              x = sx->sx_lock;
              for (;;) {
                      KASSERT(x != SX_LOCK_DESTROYED,
                          ("sx_try_slock() of destroyed sx @ %s:%d", file, line));
                      if (!(x & SX_LOCK_SHARED))
                              break;
                      if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, x + SX_ONE_SHARER)) {
                              LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line);
                              WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line);
                        LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire,
                                  sx, 0, 0, file, line, LOCKSTAT_READER);
                              TD_LOCKS_INC(curthread);
                              curthread->td_sx_slocks++;
                              return (1);
                      }
              }
      
              LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line);
              return (0);
      }
      
      int
      sx_try_slock_(struct sx *sx, const char *file, int line)
{
      
              return (sx_try_slock_int(sx LOCK_FILE_LINE_ARG));
      }
      
      int
      _sx_xlock(struct sx *sx, int opts, const char *file, int line)
{
              uintptr_t tid, x;
              int error = 0;
      
        KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() ||
                  !TD_IS_IDLETHREAD(curthread),
                  ("sx_xlock() by idle thread %p on sx %s @ %s:%d",
                  curthread, sx->lock_object.lo_name, file, line));
              KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
                  ("sx_xlock() of destroyed sx @ %s:%d", file, line));
              WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
                  line, NULL);
              tid = (uintptr_t)curthread;
              x = SX_LOCK_UNLOCKED;
              if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid))
                      error = _sx_xlock_hard(sx, x, opts LOCK_FILE_LINE_ARG);
              else
                LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx,
                          0, 0, file, line, LOCKSTAT_WRITER);
        if (!error) {
                      LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse,
                          file, line);
                      WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
                      TD_LOCKS_INC(curthread);
              }
      
              return (error);
      }
      
      int
      sx_try_xlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF)
{
              struct thread *td;
              uintptr_t tid, x;
              int rval;
              bool recursed;
      
              td = curthread;
              tid = (uintptr_t)td;
              if (SCHEDULER_STOPPED_TD(td))
                      return (1);
      
        KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td),
                  ("sx_try_xlock() by idle thread %p on sx %s @ %s:%d",
                  curthread, sx->lock_object.lo_name, file, line));
              KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
                  ("sx_try_xlock() of destroyed sx @ %s:%d", file, line));
      
              rval = 1;
              recursed = false;
              x = SX_LOCK_UNLOCKED;
              for (;;) {
                if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid))
                              break;
                      if (x == SX_LOCK_UNLOCKED)
                              continue;
                if (x == tid && (sx->lock_object.lo_flags & LO_RECURSABLE)) {
                              sx->sx_recurse++;
                              atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
                              break;
                      }
                      rval = 0;
                      break;
              }
      
              LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line);
              if (rval) {
                      WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
                          file, line);
                      if (!recursed)
                        LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire,
                                  sx, 0, 0, file, line, LOCKSTAT_WRITER);
                      TD_LOCKS_INC(curthread);
              }
      
              return (rval);
      }
      
      int
      sx_try_xlock_(struct sx *sx, const char *file, int line)
{
      
              return (sx_try_xlock_int(sx LOCK_FILE_LINE_ARG));
      }
      
      void
      _sx_xunlock(struct sx *sx, const char *file, int line)
{
      
              KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
                  ("sx_xunlock() of destroyed sx @ %s:%d", file, line));
              _sx_assert(sx, SA_XLOCKED, file, line);
              WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
              LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file,
                  line);
      #if LOCK_DEBUG > 0
              _sx_xunlock_hard(sx, (uintptr_t)curthread, file, line);
      #else
              __sx_xunlock(sx, curthread, file, line);
      #endif
        TD_LOCKS_DEC(curthread);
      }
      
      /*
       * Try to do a non-blocking upgrade from a shared lock to an exclusive lock.
       * This will only succeed if this thread holds a single shared lock.
 * Return 1 if the upgrade succeeds, 0 otherwise.
       */
      int
      sx_try_upgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF)
{
              uintptr_t x;
              uintptr_t waiters;
              int success;
      
              if (SCHEDULER_STOPPED())
                      return (1);
      
              KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
                  ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line));
              _sx_assert(sx, SA_SLOCKED, file, line);
      
              /*
               * Try to switch from one shared lock to an exclusive lock.  We need
               * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that
               * we will wake up the exclusive waiters when we drop the lock.
               */
              success = 0;
              x = SX_READ_VALUE(sx);
              for (;;) {
                if (SX_SHARERS(x) > 1)
                              break;
                      waiters = (x & SX_LOCK_WAITERS);
                if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x,
                          (uintptr_t)curthread | waiters)) {
                              success = 1;
                              break;
                      }
              }
              LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line);
              if (success) {
                      curthread->td_sx_slocks--;
                      WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
                          file, line);
                LOCKSTAT_RECORD0(sx__upgrade, sx);
              }
              return (success);
      }
      
      int
      sx_try_upgrade_(struct sx *sx, const char *file, int line)
{
      
              return (sx_try_upgrade_int(sx LOCK_FILE_LINE_ARG));
      }
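
/*
 * A common pattern (sketch only, using a made-up lock): take the lock
 * shared for the lookup and opportunistically upgrade before modifying,
 * falling back to a full exclusive acquisition when other sharers are
 * present.  Note that state observed under the shared lock must be
 * revalidated after the fallback re-lock.
 *
 *      sx_slock(&data_lock);
 *      ...
 *      if (!sx_try_upgrade(&data_lock)) {
 *              sx_sunlock(&data_lock);
 *              sx_xlock(&data_lock);
 *      }
 *      ...
 *      sx_xunlock(&data_lock);
 */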
      
      /*
       * Downgrade an unrecursed exclusive lock into a single shared lock.
       */
      void
      sx_downgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF)
{
              uintptr_t x;
              int wakeup_swapper;
      
              if (SCHEDULER_STOPPED())
                      return;
      
              KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
                  ("sx_downgrade() of destroyed sx @ %s:%d", file, line));
              _sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line);
      #ifndef INVARIANTS
              if (sx_recursed(sx))
                      panic("downgrade of a recursed lock");
      #endif
      
              WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line);
      
              /*
               * Try to switch from an exclusive lock with no shared waiters
               * to one sharer with no shared waiters.  If there are
               * exclusive waiters, we don't need to lock the sleep queue so
               * long as we preserve the flag.  We do one quick try and if
               * that fails we grab the sleepq lock to keep the flags from
               * changing and do it the slow way.
               *
               * We have to lock the sleep queue if there are shared waiters
               * so we can wake them up.
               */
              x = sx->sx_lock;
        if (!(x & SX_LOCK_SHARED_WAITERS) &&
                  atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) |
                  (x & SX_LOCK_EXCLUSIVE_WAITERS)))
                      goto out;
      
              /*
               * Lock the sleep queue so we can read the waiters bits
               * without any races and wakeup any shared waiters.
               */
              sleepq_lock(&sx->lock_object);
      
              /*
               * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single
               * shared lock.  If there are any shared waiters, wake them up.
               */
              wakeup_swapper = 0;
              x = sx->sx_lock;
              atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) |
                  (x & SX_LOCK_EXCLUSIVE_WAITERS));
              if (x & SX_LOCK_SHARED_WAITERS)
                      wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX,
                          0, SQ_SHARED_QUEUE);
              sleepq_release(&sx->lock_object);
      
        if (wakeup_swapper)
                      kick_proc0();
      
      out:
              curthread->td_sx_slocks++;
              LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
        LOCKSTAT_RECORD0(sx__downgrade, sx);
      }
      
      void
      sx_downgrade_(struct sx *sx, const char *file, int line)
{
      
              sx_downgrade_int(sx LOCK_FILE_LINE_ARG);
      }
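
/*
 * Downgrading is typically used when a writer is done modifying the
 * protected data but still wants to read it without blocking new
 * readers, e.g. (sketch only):
 *
 *      sx_xlock(&data_lock);
 *      ...                             -- modify under the exclusive lock
 *      sx_downgrade(&data_lock);
 *      ...                             -- keep reading as a single sharer
 *      sx_sunlock(&data_lock);
 */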
      
      #ifdef        ADAPTIVE_SX
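/*
 * Leave the critical section entered while advertising
 * SX_LOCK_WRITE_SPINNER once the observed lock state no longer has the
 * spinner bit set, and undo the matching extra_work accounting.
 */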
      static inline void
      sx_drop_critical(uintptr_t x, bool *in_critical, int *extra_work)
      {
      
              if (x & SX_LOCK_WRITE_SPINNER)
                      return;
        if (*in_critical) {
                      critical_exit();
                      *in_critical = false;
                      (*extra_work)--;
              }
      }
      #else
      #define sx_drop_critical(x, in_critical, extra_work) do { } while(0)
      #endif
      
      /*
       * This function represents the so-called 'hard case' for sx_xlock
       * operation.  All 'easy case' failures are redirected to this.  Note
       * that ideally this would be a static function, but it needs to be
       * accessible from at least sx.h.
       */
      int
      _sx_xlock_hard(struct sx *sx, uintptr_t x, int opts LOCK_FILE_LINE_ARG_DEF)
{
              GIANT_DECLARE;
              uintptr_t tid, setx;
      #ifdef ADAPTIVE_SX
              volatile struct thread *owner;
              u_int i, n, spintries = 0;
              enum { READERS, WRITER } sleep_reason = READERS;
              bool in_critical = false;
      #endif
      #ifdef LOCK_PROFILING
              uint64_t waittime = 0;
              int contested = 0;
      #endif
              int error = 0;
      #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS)
              struct lock_delay_arg lda;
      #endif
      #ifdef        KDTRACE_HOOKS
              u_int sleep_cnt = 0;
              int64_t sleep_time = 0;
              int64_t all_time = 0;
      #endif
      #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
              uintptr_t state = 0;
              int doing_lockprof = 0;
      #endif
              int extra_work = 0;
      
              tid = (uintptr_t)curthread;
      
      #ifdef KDTRACE_HOOKS
        if (LOCKSTAT_PROFILE_ENABLED(sx__acquire)) {
                      while (x == SX_LOCK_UNLOCKED) {
                              if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid))
                                      goto out_lockstat;
                      }
                      extra_work = 1;
                      doing_lockprof = 1;
                      all_time -= lockstat_nsecs(&sx->lock_object);
                      state = x;
              }
      #endif
      #ifdef LOCK_PROFILING
              extra_work = 1;
              doing_lockprof = 1;
              state = x;
      #endif
      
              if (SCHEDULER_STOPPED())
                      return (0);
      
      #if defined(ADAPTIVE_SX)
              lock_delay_arg_init(&lda, &sx_delay);
      #elif defined(KDTRACE_HOOKS)
              lock_delay_arg_init_noadapt(&lda);
      #endif
      
        if (__predict_false(x == SX_LOCK_UNLOCKED))
                      x = SX_READ_VALUE(sx);
      
              /* If we already hold an exclusive lock, then recurse. */
              if (__predict_false(lv_sx_owner(x) == (struct thread *)tid)) {
                      KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0,
                  ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n",
                          sx->lock_object.lo_name, file, line));
                      sx->sx_recurse++;
                      atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
                      if (LOCK_LOG_TEST(&sx->lock_object, 0))
                              CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx);
                      return (0);
              }
      
              if (LOCK_LOG_TEST(&sx->lock_object, 0))
                      CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
                          sx->lock_object.lo_name, (void *)sx->sx_lock, file, line);
      
      #ifdef HWPMC_HOOKS
        PMC_SOFT_CALL( , , lock, failed);
      #endif
              lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
                  &waittime);
      
      #ifndef INVARIANTS
              GIANT_SAVE(extra_work);
      #endif
      
              for (;;) {
                      if (x == SX_LOCK_UNLOCKED) {
                        if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid))
                                      break;
                              continue;
                      }
      #ifdef INVARIANTS
                GIANT_SAVE(extra_work);
      #endif
      #ifdef KDTRACE_HOOKS
                      lda.spin_cnt++;
      #endif
      #ifdef ADAPTIVE_SX
                      if (x == (SX_LOCK_SHARED | SX_LOCK_WRITE_SPINNER)) {
                        if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid))
                                      break;
                              continue;
                      }
      
                      /*
                       * If the lock is write locked and the owner is
                       * running on another CPU, spin until the owner stops
                       * running or the state of the lock changes.
                       */
                      if ((x & SX_LOCK_SHARED) == 0) {
                              sx_drop_critical(x, &in_critical, &extra_work);
                              sleep_reason = WRITER;
                              owner = lv_sx_owner(x);
                        if (!TD_IS_RUNNING(owner))
                                      goto sleepq;
                              if (LOCK_LOG_TEST(&sx->lock_object, 0))
                                      CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
                                          __func__, sx, owner);
                              KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
                                  "spinning", "lockname:\"%s\"",
                                  sx->lock_object.lo_name);
                              do {
                                      lock_delay(&lda);
                                      x = SX_READ_VALUE(sx);
                                      owner = lv_sx_owner(x);
                        } while (owner != NULL && TD_IS_RUNNING(owner));
                              KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
                                  "running");
                              continue;
                } else if (SX_SHARERS(x) > 0) {
                              sleep_reason = READERS;
                              if (spintries == asx_retries)
                                      goto sleepq;
                              if (!(x & SX_LOCK_WRITE_SPINNER)) {
                                      if (!in_critical) {
                                        critical_enter();
                                              in_critical = true;
                                              extra_work++;
                                      }
                                if (!atomic_fcmpset_ptr(&sx->sx_lock, &x,
                                          x | SX_LOCK_WRITE_SPINNER)) {
                                              critical_exit();
                                              in_critical = false;
                                              extra_work--;
                                              continue;
                                      }
                              }
                              spintries++;
                              KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
                                  "spinning", "lockname:\"%s\"",
                                  sx->lock_object.lo_name);
                              n = SX_SHARERS(x);
                        for (i = 0; i < asx_loops; i += n) {
                                lock_delay_spin(n);
                                      x = SX_READ_VALUE(sx);
                                      if (!(x & SX_LOCK_WRITE_SPINNER))
                                              break;
                                      if (!(x & SX_LOCK_SHARED))
                                              break;
                                      n = SX_SHARERS(x);
                                if (n == 0)
                                              break;
                              }
      #ifdef KDTRACE_HOOKS
                              lda.spin_cnt += i;
      #endif
                              KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
                                  "running");
                        if (i < asx_loops)
                                      continue;
                      }
      sleepq:
      #endif
                      sleepq_lock(&sx->lock_object);
                      x = SX_READ_VALUE(sx);
      retry_sleepq:
      
                      /*
                       * If the lock was released while spinning on the
                       * sleep queue chain lock, try again.
                       */
                      if (x == SX_LOCK_UNLOCKED) {
                              sleepq_release(&sx->lock_object);
                              sx_drop_critical(x, &in_critical, &extra_work);
                              continue;
                      }
      
      #ifdef ADAPTIVE_SX
                      /*
                       * The current lock owner might have started executing
                       * on another CPU (or the lock could have changed
                       * owners) while we were waiting on the sleep queue
                       * chain lock.  If so, drop the sleep queue lock and try
                       * again.
                       */
                      if (!(x & SX_LOCK_SHARED)) {
                              owner = (struct thread *)SX_OWNER(x);
                        if (TD_IS_RUNNING(owner)) {
                                      sleepq_release(&sx->lock_object);
                                      sx_drop_critical(x, &in_critical,
                                          &extra_work);
                                      continue;
                              }
                } else if (SX_SHARERS(x) > 0 && sleep_reason == WRITER) {
                              sleepq_release(&sx->lock_object);
                              sx_drop_critical(x, &in_critical, &extra_work);
                              continue;
                      }
      #endif
      
                      /*
                       * If an exclusive lock was released with both shared
                       * and exclusive waiters and a shared waiter hasn't
                       * woken up and acquired the lock yet, sx_lock will be
                       * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS.
                       * If we see that value, try to acquire it once.  Note
                       * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS
                       * as there are other exclusive waiters still.  If we
                       * fail, restart the loop.
                       */
                      setx = x & (SX_LOCK_WAITERS | SX_LOCK_WRITE_SPINNER);
                      if ((x & ~setx) == SX_LOCK_SHARED) {
                              setx &= ~SX_LOCK_WRITE_SPINNER;
                              if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid | setx))
                                      goto retry_sleepq;
                              sleepq_release(&sx->lock_object);
                              CTR2(KTR_LOCK, "%s: %p claimed by new writer",
                                  __func__, sx);
                              break;
                      }
      
      #ifdef ADAPTIVE_SX
                      /*
                       * It is possible we set the SX_LOCK_WRITE_SPINNER bit.
                       * It is an invariant that when the bit is set, there is
                       * a writer ready to grab the lock. Thus clear the bit since
                       * we are going to sleep.
                       */
                      if (in_critical) {
                              if ((x & SX_LOCK_WRITE_SPINNER) ||
                                  !((x & SX_LOCK_EXCLUSIVE_WAITERS))) {
                                      setx = x & ~SX_LOCK_WRITE_SPINNER;
                                      setx |= SX_LOCK_EXCLUSIVE_WAITERS;
                                if (!atomic_fcmpset_ptr(&sx->sx_lock, &x,
                                          setx)) {
                                              goto retry_sleepq;
                                      }
                              }
                              critical_exit();
                              in_critical = false;
                      } else {
      #endif
                              /*
                         * Try to set the SX_LOCK_EXCLUSIVE_WAITERS flag.  If we
                         * fail, then loop back and retry.
                               */
                        if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
                                if (!atomic_fcmpset_ptr(&sx->sx_lock, &x,
                                          x | SX_LOCK_EXCLUSIVE_WAITERS)) {
                                              goto retry_sleepq;
                                      }
                                      if (LOCK_LOG_TEST(&sx->lock_object, 0))
                                              CTR2(KTR_LOCK, "%s: %p set excl waiters flag",
                                                  __func__, sx);
                              }
      #ifdef ADAPTIVE_SX
                      }
      #endif
      
                      /*
                       * Since we have been unable to acquire the exclusive
                       * lock and the exclusive waiters flag is set, we have
                       * to sleep.
                       */
                      if (LOCK_LOG_TEST(&sx->lock_object, 0))
                              CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
                                  __func__, sx);
      
      #ifdef KDTRACE_HOOKS
                      sleep_time -= lockstat_nsecs(&sx->lock_object);
      #endif
                      sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
                          SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
                          SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE);
                      if (!(opts & SX_INTERRUPTIBLE))
                        sleepq_wait(&sx->lock_object, 0);
                      else
                        error = sleepq_wait_sig(&sx->lock_object, 0);
      #ifdef KDTRACE_HOOKS
                      sleep_time += lockstat_nsecs(&sx->lock_object);
                      sleep_cnt++;
      #endif
                      if (error) {
                              if (LOCK_LOG_TEST(&sx->lock_object, 0))
                                      CTR2(KTR_LOCK,
                              "%s: interruptible sleep by %p suspended by signal",
                                          __func__, sx);
                              break;
                      }
                      if (LOCK_LOG_TEST(&sx->lock_object, 0))
                              CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
                                  __func__, sx);
                x = SX_READ_VALUE(sx);
              }
        if (__predict_true(!extra_work))
                      return (error);
      #ifdef ADAPTIVE_SX
        if (in_critical)
                      critical_exit();
      #endif
        GIANT_RESTORE();
      #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
        if (__predict_true(!doing_lockprof))
                      return (error);
      #endif
      #ifdef KDTRACE_HOOKS
              all_time += lockstat_nsecs(&sx->lock_object);
              if (sleep_time)
                      LOCKSTAT_RECORD4(sx__block, sx, sleep_time,
                          LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0,
                          (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state));
              if (lda.spin_cnt > sleep_cnt)
                      LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time,
                          LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0,
                          (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state));
      out_lockstat:
      #endif
              if (!error)
                      LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx,
                          contested, waittime, file, line, LOCKSTAT_WRITER);
              return (error);
      }
      
      /*
       * This function represents the so-called 'hard case' for sx_xunlock
       * operation.  All 'easy case' failures are redirected to this.  Note
       * that ideally this would be a static function, but it needs to be
       * accessible from at least sx.h.
       */
      void
      _sx_xunlock_hard(struct sx *sx, uintptr_t x LOCK_FILE_LINE_ARG_DEF)
{
              uintptr_t tid, setx;
              int queue, wakeup_swapper;
      
              if (SCHEDULER_STOPPED())
                      return;
      
              tid = (uintptr_t)curthread;
      
              if (__predict_false(x == tid))
                x = SX_READ_VALUE(sx);
      
              MPASS(!(x & SX_LOCK_SHARED));
      
              if (__predict_false(x & SX_LOCK_RECURSED)) {
                      /* The lock is recursed, unrecurse one level. */
                      if ((--sx->sx_recurse) == 0)
                              atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
                      if (LOCK_LOG_TEST(&sx->lock_object, 0))
                              CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx);
                      return;
              }
      
        LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_WRITER);
        if (x == tid &&
                  atomic_cmpset_rel_ptr(&sx->sx_lock, tid, SX_LOCK_UNLOCKED))
                      return;
      
              if (LOCK_LOG_TEST(&sx->lock_object, 0))
                      CTR2(KTR_LOCK, "%s: %p contested", __func__, sx);
      
              sleepq_lock(&sx->lock_object);
              x = SX_READ_VALUE(sx);
              MPASS(x & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS));
      
              /*
               * The wake up algorithm here is quite simple and probably not
               * ideal.  It gives precedence to shared waiters if they are
               * present.  For this condition, we have to preserve the
               * state of the exclusive waiters flag.
         * If interruptible sleeps left the shared queue empty, avoid
         * starvation of the threads sleeping on the exclusive queue by
         * giving them precedence and clearing the shared waiters bit anyway.
               */
              setx = SX_LOCK_UNLOCKED;
              queue = SQ_SHARED_QUEUE;
        if ((x & SX_LOCK_EXCLUSIVE_WAITERS) != 0 &&
                  sleepq_sleepcnt(&sx->lock_object, SQ_EXCLUSIVE_QUEUE) != 0) {
                      queue = SQ_EXCLUSIVE_QUEUE;
                      setx |= (x & SX_LOCK_SHARED_WAITERS);
              }
              atomic_store_rel_ptr(&sx->sx_lock, setx);
      
              /* Wake up all the waiters for the specific queue. */
              if (LOCK_LOG_TEST(&sx->lock_object, 0))
                      CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue",
                          __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" :
                          "exclusive");
      
              wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0,
                  queue);
              sleepq_release(&sx->lock_object);
        if (wakeup_swapper)
                      kick_proc0();
      }
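
/*
 * Decide whether the thread may take a shared lock given lock state 'x'.
 * The common case is a shared lock with no exclusive waiters and no
 * write spinner.  Outside of the fast path ('fp' false), a thread that
 * already holds shared sx locks may take another shared lock as long as
 * the lock is still shared, even if writers are waiting.
 */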
      
      static bool __always_inline
      __sx_can_read(struct thread *td, uintptr_t x, bool fp)
      {
      
        if ((x & (SX_LOCK_SHARED | SX_LOCK_EXCLUSIVE_WAITERS | SX_LOCK_WRITE_SPINNER))
                              == SX_LOCK_SHARED)
                      return (true);
              if (!fp && td->td_sx_slocks && (x & SX_LOCK_SHARED))
                      return (true);
              return (false);
      }
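
/*
 * Attempt to take a shared lock by bumping the sharer count, provided
 * __sx_can_read() allows it.  On success the lock is accounted in
 * td_sx_slocks and true is returned; '*xp' is updated with the last
 * observed lock value either way.
 */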
      
      static bool __always_inline
      __sx_slock_try(struct sx *sx, struct thread *td, uintptr_t *xp, bool fp
          LOCK_FILE_LINE_ARG_DEF)
      {
      
              /*
               * If no other thread has an exclusive lock then try to bump up
               * the count of sharers.  Since we have to preserve the state
               * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the
               * shared lock loop back and retry.
               */
        while (__sx_can_read(td, *xp, fp)) {
                if (atomic_fcmpset_acq_ptr(&sx->sx_lock, xp,
                          *xp + SX_ONE_SHARER)) {
                              if (LOCK_LOG_TEST(&sx->lock_object, 0))
                                      CTR4(KTR_LOCK, "%s: %p succeed %p -> %p",
                                          __func__, sx, (void *)*xp,
                                          (void *)(*xp + SX_ONE_SHARER));
                        td->td_sx_slocks++;
                              return (true);
                      }
              }
              return (false);
      }
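
/*
 * This function represents the so-called 'hard case' for sx_slock
 * operation.  All 'easy case' failures are redirected to this.
 */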
      
      static int __noinline
      _sx_slock_hard(struct sx *sx, int opts, uintptr_t x LOCK_FILE_LINE_ARG_DEF)
{
              GIANT_DECLARE;
              struct thread *td;
      #ifdef ADAPTIVE_SX
              volatile struct thread *owner;
              u_int i, n, spintries = 0;
      #endif
      #ifdef LOCK_PROFILING
              uint64_t waittime = 0;
              int contested = 0;
      #endif
              int error = 0;
      #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS)
              struct lock_delay_arg lda;
      #endif
      #ifdef KDTRACE_HOOKS
              u_int sleep_cnt = 0;
              int64_t sleep_time = 0;
              int64_t all_time = 0;
      #endif
      #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
              uintptr_t state = 0;
      #endif
              int extra_work = 0;
      
              td = curthread;
      
      #ifdef KDTRACE_HOOKS
        if (LOCKSTAT_PROFILE_ENABLED(sx__acquire)) {
                      if (__sx_slock_try(sx, td, &x, false LOCK_FILE_LINE_ARG))
                              goto out_lockstat;
                      extra_work = 1;
                      all_time -= lockstat_nsecs(&sx->lock_object);
                      state = x;
              }
      #endif
      #ifdef LOCK_PROFILING
              extra_work = 1;
              state = x;
      #endif
      
              if (SCHEDULER_STOPPED())
                      return (0);
      
      #if defined(ADAPTIVE_SX)
              lock_delay_arg_init(&lda, &sx_delay);
      #elif defined(KDTRACE_HOOKS)
              lock_delay_arg_init_noadapt(&lda);
      #endif
      
      #ifdef HWPMC_HOOKS
        PMC_SOFT_CALL( , , lock, failed);
      #endif
              lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
                  &waittime);
      
      #ifndef INVARIANTS
              GIANT_SAVE(extra_work);
      #endif
      
              /*
               * As with rwlocks, we don't make any attempt to try to block
               * shared locks once there is an exclusive waiter.
               */
              for (;;) {
                if (__sx_slock_try(sx, td, &x, false LOCK_FILE_LINE_ARG))
                              break;
      #ifdef INVARIANTS
                GIANT_SAVE(extra_work);
      #endif
      #ifdef KDTRACE_HOOKS
                      lda.spin_cnt++;
      #endif
      
      #ifdef ADAPTIVE_SX
                      /*
                       * If the owner is running on another CPU, spin until
                       * the owner stops running or the state of the lock
                       * changes.
                       */
                      if ((x & SX_LOCK_SHARED) == 0) {
                              owner = lv_sx_owner(x);
                        if (TD_IS_RUNNING(owner)) {
                                      if (LOCK_LOG_TEST(&sx->lock_object, 0))
                                              CTR3(KTR_LOCK,
                                                  "%s: spinning on %p held by %p",
                                                  __func__, sx, owner);
                                      KTR_STATE1(KTR_SCHED, "thread",
                                          sched_tdname(curthread), "spinning",
                                          "lockname:\"%s\"", sx->lock_object.lo_name);
                                      do {
                                              lock_delay(&lda);
                                              x = SX_READ_VALUE(sx);
                                              owner = lv_sx_owner(x);
				} while (owner != NULL && TD_IS_RUNNING(owner));
                                      KTR_STATE0(KTR_SCHED, "thread",
                                          sched_tdname(curthread), "running");
                                      continue;
                              }
                      } else {
                              if ((x & SX_LOCK_WRITE_SPINNER) && SX_SHARERS(x) == 0) {
                                      MPASS(!__sx_can_read(td, x, false));
                                      lock_delay_spin(2);
                                      x = SX_READ_VALUE(sx);
                                      continue;
                              }
                              if (spintries < asx_retries) {
                                      KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
                                          "spinning", "lockname:\"%s\"",
                                          sx->lock_object.lo_name);
                                      n = SX_SHARERS(x);
                                      for (i = 0; i < asx_loops; i += n) {
                                              lock_delay_spin(n);
                                              x = SX_READ_VALUE(sx);
                                              if (!(x & SX_LOCK_SHARED))
                                                      break;
                                              n = SX_SHARERS(x);
                                              if (n == 0)
                                                      break;
                                              if (__sx_can_read(td, x, false))
                                                      break;
                                      }
      #ifdef KDTRACE_HOOKS
                                      lda.spin_cnt += i;
      #endif
                                      KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
                                          "running");
                                      if (i < asx_loops)
                                              continue;
                              }
                      }
      #endif
      
                      /*
                       * Some other thread already has an exclusive lock, so
                       * start the process of blocking.
                       */
                      sleepq_lock(&sx->lock_object);
                      x = SX_READ_VALUE(sx);
      retry_sleepq:
                      if (((x & SX_LOCK_WRITE_SPINNER) && SX_SHARERS(x) == 0) ||
                          __sx_can_read(td, x, false)) {
                              sleepq_release(&sx->lock_object);
                              continue;
                      }
      
      #ifdef ADAPTIVE_SX
                      /*
                       * If the owner is running on another CPU, spin until
                       * the owner stops running or the state of the lock
                       * changes.
                       */
                      if (!(x & SX_LOCK_SHARED)) {
                              owner = (struct thread *)SX_OWNER(x);
			if (TD_IS_RUNNING(owner)) {
                                      sleepq_release(&sx->lock_object);
                                      x = SX_READ_VALUE(sx);
                                      continue;
                              }
                      }
      #endif
      
                      /*
		 * Try to set the SX_LOCK_SHARED_WAITERS flag.  If the
		 * cmpset fails, x has been reloaded, so retry the
		 * checks above with the sleepqueue lock still held.
                       */
                      if (!(x & SX_LOCK_SHARED_WAITERS)) {
			if (!atomic_fcmpset_ptr(&sx->sx_lock, &x,
                                  x | SX_LOCK_SHARED_WAITERS))
                                      goto retry_sleepq;
                              if (LOCK_LOG_TEST(&sx->lock_object, 0))
                                      CTR2(KTR_LOCK, "%s: %p set shared waiters flag",
                                          __func__, sx);
                      }
      
                      /*
                       * Since we have been unable to acquire the shared lock,
                       * we have to sleep.
                       */
                      if (LOCK_LOG_TEST(&sx->lock_object, 0))
                              CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
                                  __func__, sx);
      
      #ifdef KDTRACE_HOOKS
                      sleep_time -= lockstat_nsecs(&sx->lock_object);
      #endif
                      sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
                          SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
                          SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE);
                      if (!(opts & SX_INTERRUPTIBLE))
			sleepq_wait(&sx->lock_object, 0);
                      else
                              error = sleepq_wait_sig(&sx->lock_object, 0);
      #ifdef KDTRACE_HOOKS
                      sleep_time += lockstat_nsecs(&sx->lock_object);
                      sleep_cnt++;
      #endif
                      if (error) {
                              if (LOCK_LOG_TEST(&sx->lock_object, 0))
                                      CTR2(KTR_LOCK,
                              "%s: interruptible sleep by %p suspended by signal",
                                          __func__, sx);
                              break;
                      }
                      if (LOCK_LOG_TEST(&sx->lock_object, 0))
                              CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
                                  __func__, sx);
		x = SX_READ_VALUE(sx);
              }
      #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
	if (__predict_true(!extra_work))
                      return (error);
      #endif
      #ifdef KDTRACE_HOOKS
              all_time += lockstat_nsecs(&sx->lock_object);
              if (sleep_time)
                      LOCKSTAT_RECORD4(sx__block, sx, sleep_time,
                          LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0,
                          (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state));
              if (lda.spin_cnt > sleep_cnt)
                      LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time,
                          LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0,
                          (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state));
      out_lockstat:
      #endif
              if (error == 0) {
                      LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx,
                          contested, waittime, file, line, LOCKSTAT_READER);
              }
              GIANT_RESTORE();
              return (error);
      }
      
      int
      _sx_slock_int(struct sx *sx, int opts LOCK_FILE_LINE_ARG_DEF)
{
              struct thread *td;
              uintptr_t x;
              int error;
      
	KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() ||
                  !TD_IS_IDLETHREAD(curthread),
                  ("sx_slock() by idle thread %p on sx %s @ %s:%d",
                  curthread, sx->lock_object.lo_name, file, line));
              KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
                  ("sx_slock() of destroyed sx @ %s:%d", file, line));
              WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL);
      
              error = 0;
              td = curthread;
              x = SX_READ_VALUE(sx);
              if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__acquire) ||
                  !__sx_slock_try(sx, td, &x, true LOCK_FILE_LINE_ARG)))
                      error = _sx_slock_hard(sx, opts, x LOCK_FILE_LINE_ARG);
              else
                      lock_profile_obtain_lock_success(&sx->lock_object, 0, 0,
                          file, line);
	if (error == 0) {
                      LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line);
                      WITNESS_LOCK(&sx->lock_object, 0, file, line);
                      TD_LOCKS_INC(curthread);
              }
              return (error);
      }
      
      int
      _sx_slock(struct sx *sx, int opts, const char *file, int line)
{
      
              return (_sx_slock_int(sx, opts LOCK_FILE_LINE_ARG));
      }
      
      static bool __always_inline
      _sx_sunlock_try(struct sx *sx, struct thread *td, uintptr_t *xp)
      {
      
              for (;;) {
		if (SX_SHARERS(*xp) > 1 || !(*xp & SX_LOCK_WAITERS)) {
			if (atomic_fcmpset_rel_ptr(&sx->sx_lock, xp,
                                  *xp - SX_ONE_SHARER)) {
                                      if (LOCK_LOG_TEST(&sx->lock_object, 0))
                                              CTR4(KTR_LOCK,
                                                  "%s: %p succeeded %p -> %p",
                                                  __func__, sx, (void *)*xp,
                                                  (void *)(*xp - SX_ONE_SHARER));
				td->td_sx_slocks--;
                                      return (true);
                              }
                              continue;
                      }
                      break;
              }
              return (false);
      }
      
      static void __noinline
      _sx_sunlock_hard(struct sx *sx, struct thread *td, uintptr_t x
          LOCK_FILE_LINE_ARG_DEF)
{
              int wakeup_swapper = 0;
              uintptr_t setx, queue;
      
              if (SCHEDULER_STOPPED())
                      return;
      
              if (_sx_sunlock_try(sx, td, &x))
                      goto out_lockstat;
      
              sleepq_lock(&sx->lock_object);
              x = SX_READ_VALUE(sx);
              for (;;) {
                      if (_sx_sunlock_try(sx, td, &x))
                              break;
      
                      /*
		 * The wakeup semantics here are quite simple: just wake
		 * up all the exclusive waiters.  Note that the state of
		 * the lock could have changed, so if the cmpset fails,
		 * loop back and retry.
                       */
                      setx = SX_LOCK_UNLOCKED;
                      queue = SQ_SHARED_QUEUE;
                      if (x & SX_LOCK_EXCLUSIVE_WAITERS) {
                              setx |= (x & SX_LOCK_SHARED_WAITERS);
                              queue = SQ_EXCLUSIVE_QUEUE;
                      }
                      setx |= (x & SX_LOCK_WRITE_SPINNER);
                      if (!atomic_fcmpset_rel_ptr(&sx->sx_lock, &x, setx))
                              continue;
                      if (LOCK_LOG_TEST(&sx->lock_object, 0))
			CTR2(KTR_LOCK,
			    "%s: %p waking up all threads on exclusive queue",
			    __func__, sx);
                      wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX,
                          0, queue);
                      td->td_sx_slocks--;
                      break;
              }
              sleepq_release(&sx->lock_object);
	if (wakeup_swapper)
                      kick_proc0();
      out_lockstat:
	LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_READER);
      }
      
      void
      _sx_sunlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF)
{
              struct thread *td;
              uintptr_t x;
      
              KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
                  ("sx_sunlock() of destroyed sx @ %s:%d", file, line));
              _sx_assert(sx, SA_SLOCKED, file, line);
              WITNESS_UNLOCK(&sx->lock_object, 0, file, line);
              LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line);
      
              td = curthread;
              x = SX_READ_VALUE(sx);
              if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__release) ||
                  !_sx_sunlock_try(sx, td, &x)))
                      _sx_sunlock_hard(sx, td, x LOCK_FILE_LINE_ARG);
              else
                      lock_profile_release_lock(&sx->lock_object);
      
	TD_LOCKS_DEC(curthread);
      }
      
      void
      _sx_sunlock(struct sx *sx, const char *file, int line)
{
      
              _sx_sunlock_int(sx LOCK_FILE_LINE_ARG);
      }
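
/*
 * Illustrative sketch (not part of the original file): a typical reader
 * of the shared-lock path implemented above, via the sx_slock() and
 * sx_sunlock() wrappers from sys/sx.h.  An interruptible acquire would
 * use sx_slock_sig() and check its error return instead.  The "foo"
 * names are hypothetical and exist only for this example.
 */
#if 0
struct foo {
	LIST_ENTRY(foo)	f_link;
	int		f_id;
};

static LIST_HEAD(, foo) foo_list = LIST_HEAD_INITIALIZER(foo_list);
static struct sx foo_lock;
SX_SYSINIT(foo_lock_init, &foo_lock, "foo lock");

static struct foo *
foo_lookup(int id)
{
	struct foo *fp;

	sx_slock(&foo_lock);		/* shared: read-only traversal */
	LIST_FOREACH(fp, &foo_list, f_link)
		if (fp->f_id == id)
			break;
	sx_sunlock(&foo_lock);
	return (fp);
}
#endif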
      
      #ifdef INVARIANT_SUPPORT
      #ifndef INVARIANTS
      #undef        _sx_assert
      #endif
      
      /*
       * In the non-WITNESS case, sx_assert() can only detect that at least
       * *some* thread owns an slock, but it cannot guarantee that *this*
       * thread owns an slock.
       */
      void
      _sx_assert(const struct sx *sx, int what, const char *file, int line)
{
      #ifndef WITNESS
              int slocked = 0;
      #endif
      
              if (SCHEDULER_STOPPED())
                      return;
	switch (what) {
              case SA_SLOCKED:
              case SA_SLOCKED | SA_NOTRECURSED:
              case SA_SLOCKED | SA_RECURSED:
      #ifndef WITNESS
                      slocked = 1;
                      /* FALLTHROUGH */
      #endif
              case SA_LOCKED:
              case SA_LOCKED | SA_NOTRECURSED:
              case SA_LOCKED | SA_RECURSED:
      #ifdef WITNESS
		witness_assert(&sx->lock_object, what, file, line);
      #else
                      /*
                       * If some other thread has an exclusive lock or we
                       * have one and are asserting a shared lock, fail.
                       * Also, if no one has a lock at all, fail.
                       */
                      if (sx->sx_lock == SX_LOCK_UNLOCKED ||
                          (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked ||
                          sx_xholder(sx) != curthread)))
                              panic("Lock %s not %slocked @ %s:%d\n",
                                  sx->lock_object.lo_name, slocked ? "share " : "",
                                  file, line);
      
                      if (!(sx->sx_lock & SX_LOCK_SHARED)) {
                              if (sx_recursed(sx)) {
                                      if (what & SA_NOTRECURSED)
                                              panic("Lock %s recursed @ %s:%d\n",
                                                  sx->lock_object.lo_name, file,
                                                  line);
                              } else if (what & SA_RECURSED)
                                      panic("Lock %s not recursed @ %s:%d\n",
                                          sx->lock_object.lo_name, file, line);
                      }
      #endif
                      break;
              case SA_XLOCKED:
              case SA_XLOCKED | SA_NOTRECURSED:
              case SA_XLOCKED | SA_RECURSED:
		if (sx_xholder(sx) != curthread)
                              panic("Lock %s not exclusively locked @ %s:%d\n",
                                  sx->lock_object.lo_name, file, line);
                      if (sx_recursed(sx)) {
                              if (what & SA_NOTRECURSED)
                                      panic("Lock %s recursed @ %s:%d\n",
                                          sx->lock_object.lo_name, file, line);
		} else if (what & SA_RECURSED)
                              panic("Lock %s not recursed @ %s:%d\n",
                                  sx->lock_object.lo_name, file, line);
                      break;
              case SA_UNLOCKED:
      #ifdef WITNESS
		witness_assert(&sx->lock_object, what, file, line);
      #else
                      /*
		 * If we hold an exclusive lock, fail.  We can't
		 * reliably check whether we hold a shared lock or
		 * not.
                       */
                      if (sx_xholder(sx) == curthread)
                              panic("Lock %s exclusively locked @ %s:%d\n",
                                  sx->lock_object.lo_name, file, line);
      #endif
                      break;
              default:
                      panic("Unknown sx lock assertion: %d @ %s:%d", what, file,
                          line);
              }
      }
      #endif        /* INVARIANT_SUPPORT */
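
/*
 * Illustrative sketch (not part of the original file): sx_assert() is
 * typically used at the top of internal helpers to document and enforce
 * the locking protocol, here requiring the exclusive side.  The
 * "foo_softc" structure is hypothetical.
 */
#if 0
struct foo_softc {
	struct sx	sc_lock;
	int		sc_count;
};

static void
foo_count_inc(struct foo_softc *sc)
{

	sx_assert(&sc->sc_lock, SA_XLOCKED);	/* caller holds sc_lock exclusively */
	sc->sc_count++;
}
#endif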
      
      #ifdef DDB
      static void
      db_show_sx(const struct lock_object *lock)
      {
              struct thread *td;
              const struct sx *sx;
      
              sx = (const struct sx *)lock;
      
              db_printf(" state: ");
              if (sx->sx_lock == SX_LOCK_UNLOCKED)
                      db_printf("UNLOCKED\n");
              else if (sx->sx_lock == SX_LOCK_DESTROYED) {
                      db_printf("DESTROYED\n");
                      return;
              } else if (sx->sx_lock & SX_LOCK_SHARED)
                      db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock));
              else {
                      td = sx_xholder(sx);
                      db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
                          td->td_tid, td->td_proc->p_pid, td->td_name);
                      if (sx_recursed(sx))
                              db_printf(" recursed: %d\n", sx->sx_recurse);
              }
      
              db_printf(" waiters: ");
              switch(sx->sx_lock &
                  (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) {
              case SX_LOCK_SHARED_WAITERS:
                      db_printf("shared\n");
                      break;
              case SX_LOCK_EXCLUSIVE_WAITERS:
                      db_printf("exclusive\n");
                      break;
              case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS:
                      db_printf("exclusive and shared\n");
                      break;
              default:
                      db_printf("none\n");
              }
      }
      
      /*
       * Check to see if a thread that is blocked on a sleep queue is actually
       * blocked on an sx lock.  If so, output some details and return true.
       * If the lock has an exclusive owner, return that in *ownerp.
       */
      int
      sx_chain(struct thread *td, struct thread **ownerp)
      {
              const struct sx *sx;
      
              /*
               * Check to see if this thread is blocked on an sx lock.
               * First, we check the lock class.  If that is ok, then we
               * compare the lock name against the wait message.
               */
              sx = td->td_wchan;
              if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx ||
                  sx->lock_object.lo_name != td->td_wmesg)
                      return (0);
      
              /* We think we have an sx lock, so output some details. */
              db_printf("blocked on sx \"%s\" ", td->td_wmesg);
              *ownerp = sx_xholder(sx);
              if (sx->sx_lock & SX_LOCK_SHARED)
                      db_printf("SLOCK (count %ju)\n",
                          (uintmax_t)SX_SHARERS(sx->sx_lock));
              else
                      db_printf("XLOCK\n");
              return (1);
      }
      #endif
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/counter.h>
      #include <sys/epoch.h>
      #include <sys/gtaskqueue.h>
      #include <sys/kernel.h>
      #include <sys/limits.h>
      #include <sys/lock.h>
      #include <sys/malloc.h>
      #include <sys/mutex.h>
      #include <sys/pcpu.h>
      #include <sys/proc.h>
      #include <sys/sched.h>
      #include <sys/sx.h>
      #include <sys/smp.h>
      #include <sys/sysctl.h>
      #include <sys/turnstile.h>
      #ifdef EPOCH_TRACE
      #include <machine/stdarg.h>
      #include <sys/stack.h>
      #include <sys/tree.h>
      #endif
      #include <vm/vm.h>
      #include <vm/vm_extern.h>
      #include <vm/vm_kern.h>
      #include <vm/uma.h>
      
      #include <ck_epoch.h>
      
      #ifdef __amd64__
#define EPOCH_ALIGN (CACHE_LINE_SIZE * 2)
      #else
      #define EPOCH_ALIGN CACHE_LINE_SIZE
      #endif
      
      TAILQ_HEAD (epoch_tdlist, epoch_tracker);
      typedef struct epoch_record {
              ck_epoch_record_t er_record;
              struct epoch_context er_drain_ctx;
              struct epoch *er_parent;
              volatile struct epoch_tdlist er_tdlist;
              volatile uint32_t er_gen;
              uint32_t er_cpuid;
      } __aligned(EPOCH_ALIGN)     *epoch_record_t;
      
      struct epoch {
              struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
              epoch_record_t e_pcpu_record;
              int        e_in_use;
              int        e_flags;
              struct sx e_drain_sx;
              struct mtx e_drain_mtx;
              volatile int e_drain_count;
              const char *e_name;
      };
      
      /* arbitrary --- needs benchmarking */
      #define MAX_ADAPTIVE_SPIN 100
      #define MAX_EPOCHS 64
      
      CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context));
      SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "epoch information");
      SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "epoch stats");
      
      /* Stats. */
      static counter_u64_t block_count;
      
      SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW,
          &block_count, "# of times a thread was in an epoch when epoch_wait was called");
      static counter_u64_t migrate_count;
      
      SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW,
          &migrate_count, "# of times thread was migrated to another CPU in epoch_wait");
      static counter_u64_t turnstile_count;
      
      SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW,
          &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait");
      static counter_u64_t switch_count;
      
      SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW,
          &switch_count, "# of times a thread voluntarily context switched in epoch_wait");
      static counter_u64_t epoch_call_count;
      
      SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW,
          &epoch_call_count, "# of times a callback was deferred");
      static counter_u64_t epoch_call_task_count;
      
      SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW,
          &epoch_call_task_count, "# of times a callback task was run");
      
      TAILQ_HEAD (threadlist, thread);
      
      CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
          ck_epoch_entry_container)
      
      static struct epoch epoch_array[MAX_EPOCHS];
      
      DPCPU_DEFINE(struct grouptask, epoch_cb_task);
      DPCPU_DEFINE(int, epoch_cb_count);
      
      static __read_mostly int inited;
      __read_mostly epoch_t global_epoch;
      __read_mostly epoch_t global_epoch_preempt;
      
      static void epoch_call_task(void *context __unused);
      static         uma_zone_t pcpu_zone_record;
      
      static struct sx epoch_sx;
      
      #define        EPOCH_LOCK() sx_xlock(&epoch_sx)
      #define        EPOCH_UNLOCK() sx_xunlock(&epoch_sx)
      
      #ifdef EPOCH_TRACE
      struct stackentry {
              RB_ENTRY(stackentry) se_node;
              struct stack se_stack;
      };
      
      static int
      stackentry_compare(struct stackentry *a, struct stackentry *b)
      {
      
              if (a->se_stack.depth > b->se_stack.depth)
                      return (1);
              if (a->se_stack.depth < b->se_stack.depth)
                      return (-1);
              for (int i = 0; i < a->se_stack.depth; i++) {
                      if (a->se_stack.pcs[i] > b->se_stack.pcs[i])
                              return (1);
                      if (a->se_stack.pcs[i] < b->se_stack.pcs[i])
                              return (-1);
              }
      
              return (0);
      }
      
      RB_HEAD(stacktree, stackentry) epoch_stacks = RB_INITIALIZER(&epoch_stacks);
      RB_GENERATE_STATIC(stacktree, stackentry, se_node, stackentry_compare);
      
      static struct mtx epoch_stacks_lock;
      MTX_SYSINIT(epochstacks, &epoch_stacks_lock, "epoch_stacks", MTX_DEF);
      
      static bool epoch_trace_stack_print = true;
      SYSCTL_BOOL(_kern_epoch, OID_AUTO, trace_stack_print, CTLFLAG_RWTUN,
          &epoch_trace_stack_print, 0, "Print stack traces on epoch reports");
      
      static void epoch_trace_report(const char *fmt, ...) __printflike(1, 2);
      static inline void
      epoch_trace_report(const char *fmt, ...)
      {
              va_list ap;
              struct stackentry se, *new;
      
              stack_zero(&se.se_stack);        /* XXX: is it really needed? */
              stack_save(&se.se_stack);
      
              /* Tree is never reduced - go lockless. */
              if (RB_FIND(stacktree, &epoch_stacks, &se) != NULL)
                      return;
      
              new = malloc(sizeof(*new), M_STACK, M_NOWAIT);
              if (new != NULL) {
                      bcopy(&se.se_stack, &new->se_stack, sizeof(struct stack));
      
                      mtx_lock(&epoch_stacks_lock);
                      new = RB_INSERT(stacktree, &epoch_stacks, new);
                      mtx_unlock(&epoch_stacks_lock);
                      if (new != NULL)
                              free(new, M_STACK);
              }
      
              va_start(ap, fmt);
              (void)vprintf(fmt, ap);
              va_end(ap);
              if (epoch_trace_stack_print)
                      stack_print_ddb(&se.se_stack);
      }
      
      static inline void
      epoch_trace_enter(struct thread *td, epoch_t epoch, epoch_tracker_t et,
          const char *file, int line)
      {
              epoch_tracker_t iet;
      
              SLIST_FOREACH(iet, &td->td_epochs, et_tlink)
                      if (iet->et_epoch == epoch)
                              epoch_trace_report("Recursively entering epoch %s "
                                  "at %s:%d, previously entered at %s:%d\n",
                                  epoch->e_name, file, line,
                                  iet->et_file, iet->et_line);
              et->et_epoch = epoch;
              et->et_file = file;
              et->et_line = line;
              SLIST_INSERT_HEAD(&td->td_epochs, et, et_tlink);
      }
      
      static inline void
      epoch_trace_exit(struct thread *td, epoch_t epoch, epoch_tracker_t et,
          const char *file, int line)
      {
      
              if (SLIST_FIRST(&td->td_epochs) != et) {
		epoch_trace_report("Exiting epoch %s in a non-nested order "
                          "at %s:%d. Most recently entered %s at %s:%d\n",
                          epoch->e_name,
                          file, line,
                          SLIST_FIRST(&td->td_epochs)->et_epoch->e_name,
                          SLIST_FIRST(&td->td_epochs)->et_file,
                          SLIST_FIRST(&td->td_epochs)->et_line);
                      /* This will panic if et is not anywhere on td_epochs. */
                      SLIST_REMOVE(&td->td_epochs, et, epoch_tracker, et_tlink);
              } else
                      SLIST_REMOVE_HEAD(&td->td_epochs, et_tlink);
      }
      
      /* Used by assertions that check thread state before going to sleep. */
      void
      epoch_trace_list(struct thread *td)
      {
              epoch_tracker_t iet;
      
              SLIST_FOREACH(iet, &td->td_epochs, et_tlink)
                      printf("Epoch %s entered at %s:%d\n", iet->et_epoch->e_name,
                          iet->et_file, iet->et_line);
      }
      #endif /* EPOCH_TRACE */
      
      static void
      epoch_init(void *arg __unused)
      {
              int cpu;
      
              block_count = counter_u64_alloc(M_WAITOK);
              migrate_count = counter_u64_alloc(M_WAITOK);
              turnstile_count = counter_u64_alloc(M_WAITOK);
              switch_count = counter_u64_alloc(M_WAITOK);
              epoch_call_count = counter_u64_alloc(M_WAITOK);
              epoch_call_task_count = counter_u64_alloc(M_WAITOK);
      
              pcpu_zone_record = uma_zcreate("epoch_record pcpu",
                  sizeof(struct epoch_record), NULL, NULL, NULL, NULL,
                  UMA_ALIGN_PTR, UMA_ZONE_PCPU);
              CPU_FOREACH(cpu) {
                      GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0,
                          epoch_call_task, NULL);
                      taskqgroup_attach_cpu(qgroup_softirq,
                          DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, NULL, NULL,
                          "epoch call task");
              }
      #ifdef EPOCH_TRACE
              SLIST_INIT(&thread0.td_epochs);
      #endif
              sx_init(&epoch_sx, "epoch-sx");
              inited = 1;
              global_epoch = epoch_alloc("Global", 0);
              global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT);
      }
      SYSINIT(epoch, SI_SUB_EPOCH, SI_ORDER_FIRST, epoch_init, NULL);
      
      #if !defined(EARLY_AP_STARTUP)
      static void
      epoch_init_smp(void *dummy __unused)
      {
              inited = 2;
      }
      SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL);
      #endif
      
      static void
      epoch_ctor(epoch_t epoch)
      {
              epoch_record_t er;
              int cpu;
      
              epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK);
              CPU_FOREACH(cpu) {
                      er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
                      bzero(er, sizeof(*er));
                      ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
                      TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist);
                      er->er_cpuid = cpu;
                      er->er_parent = epoch;
              }
      }
      
      static void
      epoch_adjust_prio(struct thread *td, u_char prio)
      {
      
              thread_lock(td);
              sched_prio(td, prio);
              thread_unlock(td);
      }
      
      epoch_t
      epoch_alloc(const char *name, int flags)
      {
              epoch_t epoch;
              int i;
      
              MPASS(name != NULL);
      
              if (__predict_false(!inited))
                      panic("%s called too early in boot", __func__);
      
              EPOCH_LOCK();
      
              /*
               * Find a free index in the epoch array. If no free index is
               * found, try to use the index after the last one.
               */
              for (i = 0;; i++) {
                      /*
                       * If too many epochs are currently allocated,
                       * return NULL.
                       */
                      if (i == MAX_EPOCHS) {
                              epoch = NULL;
                              goto done;
                      }
                      if (epoch_array[i].e_in_use == 0)
                              break;
              }
      
              epoch = epoch_array + i;
              ck_epoch_init(&epoch->e_epoch);
              epoch_ctor(epoch);
              epoch->e_flags = flags;
              epoch->e_name = name;
              sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
              mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
      
              /*
               * Set e_in_use last, because when this field is set the
               * epoch_call_task() function will start scanning this epoch
               * structure.
               */
              atomic_store_rel_int(&epoch->e_in_use, 1);
      done:
              EPOCH_UNLOCK();
              return (epoch);
      }
      
      void
      epoch_free(epoch_t epoch)
      {
      
              EPOCH_LOCK();
      
              MPASS(epoch->e_in_use != 0);
      
              epoch_drain_callbacks(epoch);
      
              atomic_store_rel_int(&epoch->e_in_use, 0);
              /*
	 * Make sure the epoch_call_task() function sees e_in_use equal
	 * to zero, by calling epoch_wait() on the global_epoch.
               */
              epoch_wait(global_epoch);
              uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
              mtx_destroy(&epoch->e_drain_mtx);
              sx_destroy(&epoch->e_drain_sx);
              memset(epoch, 0, sizeof(*epoch));
      
              EPOCH_UNLOCK();
      }
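
/*
 * Illustrative sketch (not part of the original file): a subsystem would
 * normally allocate its epoch once after SI_SUB_EPOCH has run and free it
 * on teardown.  "foo_epoch" and the foo_* names are hypothetical.
 */
#if 0
static epoch_t foo_epoch;

static void
foo_epoch_init(void *dummy __unused)
{

	foo_epoch = epoch_alloc("foo", EPOCH_PREEMPT);
}
SYSINIT(fooepoch, SI_SUB_DRIVERS, SI_ORDER_ANY, foo_epoch_init, NULL);

static void
foo_epoch_uninit(void *dummy __unused)
{

	/* epoch_free() drains any pending callbacks before releasing the slot. */
	epoch_free(foo_epoch);
}
SYSUNINIT(fooepoch, SI_SUB_DRIVERS, SI_ORDER_ANY, foo_epoch_uninit, NULL);
#endif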
      
      static epoch_record_t
      epoch_currecord(epoch_t epoch)
      {
      
              return (zpcpu_get(epoch->e_pcpu_record));
      }
      
      #define INIT_CHECK(epoch)                                        \
              do {                                                        \
                      if (__predict_false((epoch) == NULL))                \
                              return;                                        \
              } while (0)
      
      void
      _epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
{
              struct epoch_record *er;
              struct thread *td;
      
              MPASS(cold || epoch != NULL);
              MPASS(epoch->e_flags & EPOCH_PREEMPT);
              td = curthread;
              MPASS((vm_offset_t)et >= td->td_kstack &&
                  (vm_offset_t)et + sizeof(struct epoch_tracker) <=
                  td->td_kstack + td->td_kstack_pages * PAGE_SIZE);
      
              INIT_CHECK(epoch);
      #ifdef EPOCH_TRACE
              epoch_trace_enter(td, epoch, et, file, line);
      #endif
              et->et_td = td;
              THREAD_NO_SLEEPING();
              critical_enter();
              sched_pin();
              td->td_pre_epoch_prio = td->td_priority;
              er = epoch_currecord(epoch);
              TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
	ck_epoch_begin(&er->er_record, &et->et_section);
              critical_exit();
      }
      
      void
      epoch_enter(epoch_t epoch)
      {
              epoch_record_t er;
      
              MPASS(cold || epoch != NULL);
              INIT_CHECK(epoch);
              critical_enter();
              er = epoch_currecord(epoch);
              ck_epoch_begin(&er->er_record, NULL);
      }
      
      void
      _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
{
              struct epoch_record *er;
              struct thread *td;
      
              INIT_CHECK(epoch);
              td = curthread;
              critical_enter();
              sched_unpin();
              THREAD_SLEEPING_OK();
              er = epoch_currecord(epoch);
              MPASS(epoch->e_flags & EPOCH_PREEMPT);
              MPASS(et != NULL);
              MPASS(et->et_td == td);
      #ifdef INVARIANTS
              et->et_td = (void*)0xDEADBEEF;
      #endif
              ck_epoch_end(&er->er_record, &et->et_section);
	TAILQ_REMOVE(&er->er_tdlist, et, et_link);
              er->er_gen++;
	if (__predict_false(td->td_pre_epoch_prio != td->td_priority))
		epoch_adjust_prio(td, td->td_pre_epoch_prio);
              critical_exit();
      #ifdef EPOCH_TRACE
              epoch_trace_exit(td, epoch, et, file, line);
      #endif
      }
      
      void
      epoch_exit(epoch_t epoch)
      {
              epoch_record_t er;
      
              INIT_CHECK(epoch);
              er = epoch_currecord(epoch);
              ck_epoch_end(&er->er_record, NULL);
              critical_exit();
      }
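
/*
 * Illustrative sketch (not part of the original file): a preemptible
 * read-side section using the enter/exit pair above.  The tracker must
 * live on the caller's kernel stack, as asserted in
 * _epoch_enter_preempt(), and the section may be preempted but must not
 * sleep.  "foo_epoch" (from the earlier sketch) and "foo_shared_state"
 * are hypothetical.
 */
#if 0
struct foo_state {
	uint64_t	fs_value;
};

/* Published by writers; retired via epoch_wait_preempt() or epoch_call(). */
static struct foo_state *foo_shared_state;

static uint64_t
foo_read_value(void)
{
	struct epoch_tracker et;
	uint64_t v;

	epoch_enter_preempt(foo_epoch, &et);
	/* foo_shared_state cannot be freed while we are in the section. */
	v = foo_shared_state->fs_value;
	epoch_exit_preempt(foo_epoch, &et);
	return (v);
}
#endif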
      
      /*
       * epoch_block_handler_preempt() is a callback from the CK code when another
       * thread is currently in an epoch section.
       */
      static void
      epoch_block_handler_preempt(struct ck_epoch *global __unused,
          ck_epoch_record_t *cr, void *arg __unused)
      {
              epoch_record_t record;
              struct thread *td, *owner, *curwaittd;
              struct epoch_tracker *tdwait;
              struct turnstile *ts;
              struct lock_object *lock;
              int spincount, gen;
              int locksheld __unused;
      
              record = __containerof(cr, struct epoch_record, er_record);
              td = curthread;
              locksheld = td->td_locks;
              spincount = 0;
              counter_u64_add(block_count, 1);
              /*
	 * We lost a race and there are no longer any threads
	 * on the CPU in an epoch section.
               */
              if (TAILQ_EMPTY(&record->er_tdlist))
                      return;
      
              if (record->er_cpuid != curcpu) {
                      /*
                       * If the head of the list is running, we can wait for it
                       * to remove itself from the list and thus save us the
                       * overhead of a migration
                       */
                      gen = record->er_gen;
                      thread_unlock(td);
                      /*
                       * We can't actually check if the waiting thread is running
                       * so we simply poll for it to exit before giving up and
                       * migrating.
                       */
                      do {
                              cpu_spinwait();
                      } while (!TAILQ_EMPTY(&record->er_tdlist) &&
                                       gen == record->er_gen &&
                                       spincount++ < MAX_ADAPTIVE_SPIN);
                      thread_lock(td);
                      /*
                       * If the generation has changed we can poll again
                       * otherwise we need to migrate.
                       */
                      if (gen != record->er_gen)
                              return;
                      /*
                       * Being on the same CPU as that of the record on which
                       * we need to wait allows us access to the thread
                       * list associated with that CPU. We can then examine the
                       * oldest thread in the queue and wait on its turnstile
                       * until it resumes and so on until a grace period
                       * elapses.
                       *
                       */
                      counter_u64_add(migrate_count, 1);
                      sched_bind(td, record->er_cpuid);
                      /*
                       * At this point we need to return to the ck code
                       * to scan to see if a grace period has elapsed.
                       * We can't move on to check the thread list, because
                       * in the meantime new threads may have arrived that
                       * in fact belong to a different epoch.
                       */
                      return;
              }
              /*
               * Try to find a thread in an epoch section on this CPU
               * waiting on a turnstile. Otherwise find the lowest
               * priority thread (highest prio value) and drop our priority
               * to match to allow it to run.
               */
              TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
                      /*
		 * Propagate our priority to any other waiters to prevent us
		 * from starving them.  They will have their original priority
		 * restored on exit from epoch_wait().
                       */
                      curwaittd = tdwait->et_td;
                      if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) {
                              critical_enter();
                              thread_unlock(td);
                              thread_lock(curwaittd);
                              sched_prio(curwaittd, td->td_priority);
                              thread_unlock(curwaittd);
                              thread_lock(td);
                              critical_exit();
                      }
                      if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) &&
                          ((ts = curwaittd->td_blocked) != NULL)) {
                              /*
			 * We unlock td to allow turnstile_wait() to
			 * reacquire the thread lock.  Before unlocking it
			 * we enter a critical section, so that when
			 * dropping the thread lock re-enables interrupts,
			 * curwaittd does not get a chance to run.
                               */
                              critical_enter();
                              thread_unlock(td);
      
                              if (turnstile_lock(ts, &lock, &owner)) {
                                      if (ts == curwaittd->td_blocked) {
                                              MPASS(TD_IS_INHIBITED(curwaittd) &&
                                                  TD_ON_LOCK(curwaittd));
                                              critical_exit();
                                              turnstile_wait(ts, owner,
                                                  curwaittd->td_tsqueue);
                                              counter_u64_add(turnstile_count, 1);
                                              thread_lock(td);
                                              return;
                                      }
                                      turnstile_unlock(ts, lock);
                              }
                              thread_lock(td);
                              critical_exit();
                              KASSERT(td->td_locks == locksheld,
                                  ("%d extra locks held", td->td_locks - locksheld));
                      }
              }
              /*
               * We didn't find any threads actually blocked on a lock
               * so we have nothing to do except context switch away.
               */
              counter_u64_add(switch_count, 1);
              mi_switch(SW_VOL | SWT_RELINQUISH);
              /*
	 * It is important that the thread lock is dropped while yielding,
	 * to allow other threads to acquire the lock pointed to by
	 * TDQ_LOCKPTR(td).  Currently mi_switch() will unlock the
	 * thread lock before returning; otherwise a deadlock-like
	 * situation might arise.
               */
              thread_lock(td);
      }
      
      void
      epoch_wait_preempt(epoch_t epoch)
      {
              struct thread *td;
              int was_bound;
              int old_cpu;
              int old_pinned;
              u_char old_prio;
              int locks __unused;
      
              MPASS(cold || epoch != NULL);
              INIT_CHECK(epoch);
              td = curthread;
      #ifdef INVARIANTS
              locks = curthread->td_locks;
              MPASS(epoch->e_flags & EPOCH_PREEMPT);
              if ((epoch->e_flags & EPOCH_LOCKED) == 0)
                      WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
                          "epoch_wait() can be long running");
              KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle "
                  "of an epoch section of the same epoch"));
      #endif
              DROP_GIANT();
              thread_lock(td);
      
              old_cpu = PCPU_GET(cpuid);
              old_pinned = td->td_pinned;
              old_prio = td->td_priority;
              was_bound = sched_is_bound(td);
              sched_unbind(td);
              td->td_pinned = 0;
              sched_bind(td, old_cpu);
      
              ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt,
                  NULL);
      
              /* restore CPU binding, if any */
              if (was_bound != 0) {
                      sched_bind(td, old_cpu);
              } else {
                      /* get thread back to initial CPU, if any */
                      if (old_pinned != 0)
                              sched_bind(td, old_cpu);
                      sched_unbind(td);
              }
              /* restore pinned after bind */
              td->td_pinned = old_pinned;
      
              /* restore thread priority */
              sched_prio(td, old_prio);
              thread_unlock(td);
              PICKUP_GIANT();
              KASSERT(td->td_locks == locks,
                  ("%d residual locks held", td->td_locks - locks));
      }
      
      static void
      epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
          void *arg __unused)
      {
              cpu_spinwait();
      }
      
      void
      epoch_wait(epoch_t epoch)
      {
      
              MPASS(cold || epoch != NULL);
              INIT_CHECK(epoch);
              MPASS(epoch->e_flags == 0);
              critical_enter();
              ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
              critical_exit();
      }
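
/*
 * Illustrative sketch (not part of the original file): the synchronous
 * writer-side pattern.  Once an object has been unpublished so that no
 * new reader can find it, epoch_wait_preempt() blocks (and may sleep)
 * until every reader that might still hold a reference to it has left
 * its section; only then is it safe to free.  This continues the
 * hypothetical foo_epoch/foo_shared_state sketch above; M_FOO is a
 * hypothetical malloc type.
 */
#if 0
static MALLOC_DEFINE(M_FOO, "fooexample", "epoch example data");

static void
foo_retire_sync(struct foo_state *old)
{

	/*
	 * The caller has already replaced foo_shared_state (e.g. with a
	 * release store), so "old" is unreachable for new readers.
	 */
	epoch_wait_preempt(foo_epoch);
	free(old, M_FOO);
}
#endif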
      
      void
      epoch_call(epoch_t epoch, epoch_callback_t callback, epoch_context_t ctx)
{
              epoch_record_t er;
              ck_epoch_entry_t *cb;
      
              cb = (void *)ctx;
      
              MPASS(callback);
              /* too early in boot to have epoch set up */
              if (__predict_false(epoch == NULL))
                      goto boottime;
      #if !defined(EARLY_AP_STARTUP)
              if (__predict_false(inited < 2))
                      goto boottime;
      #endif
      
              critical_enter();
              *DPCPU_PTR(epoch_cb_count) += 1;
              er = epoch_currecord(epoch);
              ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback);
              critical_exit();
              return;
      boottime:
              callback(ctx);
      }
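
/*
 * Illustrative sketch (not part of the original file): the asynchronous
 * variant of the retirement pattern.  The object embeds a struct
 * epoch_context, the callback runs from epoch_call_task() after a grace
 * period and recovers the object with __containerof().  This reuses the
 * hypothetical foo_epoch and M_FOO from the sketches above.
 */
#if 0
struct foo_node {
	struct epoch_context	fn_epoch_ctx;
	int			fn_id;
};

static void
foo_node_free_cb(epoch_context_t ctx)
{
	struct foo_node *fn;

	fn = __containerof(ctx, struct foo_node, fn_epoch_ctx);
	free(fn, M_FOO);
}

static void
foo_node_retire(struct foo_node *fn)
{

	/* fn must already be unreachable for new readers. */
	epoch_call(foo_epoch, foo_node_free_cb, &fn->fn_epoch_ctx);
}
#endif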
      
      static void
      epoch_call_task(void *arg __unused)
      {
              ck_stack_entry_t *cursor, *head, *next;
              ck_epoch_record_t *record;
              epoch_record_t er;
              epoch_t epoch;
              ck_stack_t cb_stack;
              int i, npending, total;
      
              ck_stack_init(&cb_stack);
              critical_enter();
              epoch_enter(global_epoch);
              for (total = i = 0; i != MAX_EPOCHS; i++) {
                      epoch = epoch_array + i;
                      if (__predict_false(
                          atomic_load_acq_int(&epoch->e_in_use) == 0))
                              continue;
                      er = epoch_currecord(epoch);
                      record = &er->er_record;
                      if ((npending = record->n_pending) == 0)
                              continue;
                      ck_epoch_poll_deferred(record, &cb_stack);
                      total += npending - record->n_pending;
              }
              epoch_exit(global_epoch);
              *DPCPU_PTR(epoch_cb_count) -= total;
              critical_exit();
      
              counter_u64_add(epoch_call_count, total);
              counter_u64_add(epoch_call_task_count, 1);
      
              head = ck_stack_batch_pop_npsc(&cb_stack);
              for (cursor = head; cursor != NULL; cursor = next) {
                      struct ck_epoch_entry *entry =
                          ck_epoch_entry_container(cursor);
      
                      next = CK_STACK_NEXT(cursor);
                      entry->function(entry);
              }
      }
      
      int
      in_epoch_verbose(epoch_t epoch, int dump_onfail)
{
              struct epoch_tracker *tdwait;
              struct thread *td;
              epoch_record_t er;
      
              td = curthread;
	if (THREAD_CAN_SLEEP())
                      return (0);
              if (__predict_false((epoch) == NULL))
                      return (0);
              critical_enter();
              er = epoch_currecord(epoch);
	TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
                      if (tdwait->et_td == td) {
                              critical_exit();
                              return (1);
                      }
      #ifdef INVARIANTS
              if (dump_onfail) {
                      MPASS(td->td_pinned);
                      printf("cpu: %d id: %d\n", curcpu, td->td_tid);
                      TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
                              printf("td_tid: %d ", tdwait->et_td->td_tid);
                      printf("\n");
              }
      #endif
              critical_exit();
              return (0);
      }
      
      int
      in_epoch(epoch_t epoch)
{
              return (in_epoch_verbose(epoch, 0));
      }
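
/*
 * Illustrative sketch (not part of the original file): in_epoch() is
 * mostly useful for assertions in helpers that rely on the caller having
 * already entered the epoch.  This reuses the hypothetical foo_epoch and
 * foo_shared_state from the sketches above.
 */
#if 0
static uint64_t
foo_read_value_locked(void)
{

	MPASS(in_epoch(foo_epoch));	/* caller must be in a foo_epoch section */
	return (foo_shared_state->fs_value);
}
#endif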
      
      static void
      epoch_drain_cb(struct epoch_context *ctx)
      {
              struct epoch *epoch =
                  __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;
      
              if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
                      mtx_lock(&epoch->e_drain_mtx);
                      wakeup(epoch);
                      mtx_unlock(&epoch->e_drain_mtx);
              }
      }
      
      void
      epoch_drain_callbacks(epoch_t epoch)
      {
              epoch_record_t er;
              struct thread *td;
              int was_bound;
              int old_pinned;
              int old_cpu;
              int cpu;
      
              WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
                  "epoch_drain_callbacks() may sleep!");
      
              /* too early in boot to have epoch set up */
              if (__predict_false(epoch == NULL))
                      return;
      #if !defined(EARLY_AP_STARTUP)
              if (__predict_false(inited < 2))
                      return;
      #endif
              DROP_GIANT();
      
              sx_xlock(&epoch->e_drain_sx);
              mtx_lock(&epoch->e_drain_mtx);
      
              td = curthread;
              thread_lock(td);
              old_cpu = PCPU_GET(cpuid);
              old_pinned = td->td_pinned;
              was_bound = sched_is_bound(td);
              sched_unbind(td);
              td->td_pinned = 0;
      
              CPU_FOREACH(cpu)
                      epoch->e_drain_count++;
              CPU_FOREACH(cpu) {
                      er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
                      sched_bind(td, cpu);
                      epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
              }
      
              /* restore CPU binding, if any */
              if (was_bound != 0) {
                      sched_bind(td, old_cpu);
              } else {
                      /* get thread back to initial CPU, if any */
                      if (old_pinned != 0)
                              sched_bind(td, old_cpu);
                      sched_unbind(td);
              }
              /* restore pinned after bind */
              td->td_pinned = old_pinned;
      
              thread_unlock(td);
      
              while (epoch->e_drain_count != 0)
                      msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);
      
              mtx_unlock(&epoch->e_drain_mtx);
              sx_xunlock(&epoch->e_drain_sx);
      
              PICKUP_GIANT();
      }
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice unmodified, this list of conditions, and the following
       *    disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
       * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
       * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
       * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
       * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
       * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
       * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Implements the virtqueue interface as basically described
       * in the original VirtIO paper.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/kernel.h>
      #include <sys/malloc.h>
      #include <sys/sglist.h>
      #include <vm/vm.h>
      #include <vm/pmap.h>
      
      #include <machine/cpu.h>
      #include <machine/bus.h>
      #include <machine/atomic.h>
      #include <machine/resource.h>
      #include <sys/bus.h>
      #include <sys/rman.h>
      
      #include <dev/virtio/virtio.h>
      #include <dev/virtio/virtqueue.h>
      #include <dev/virtio/virtio_ring.h>
      
      #include "virtio_bus_if.h"
      
      struct virtqueue {
              device_t                 vq_dev;
              char                         vq_name[VIRTQUEUE_MAX_NAME_SZ];
              uint16_t                 vq_queue_index;
              uint16_t                 vq_nentries;
              uint32_t                 vq_flags;
      #define        VIRTQUEUE_FLAG_INDIRECT         0x0001
      #define        VIRTQUEUE_FLAG_EVENT_IDX 0x0002
      
              int                         vq_alignment;
              int                         vq_ring_size;
              void                        *vq_ring_mem;
              int                         vq_max_indirect_size;
              int                         vq_indirect_mem_size;
              virtqueue_intr_t        *vq_intrhand;
              void                        *vq_intrhand_arg;
      
              struct vring                 vq_ring;
              uint16_t                 vq_free_cnt;
              uint16_t                 vq_queued_cnt;
              /*
               * Head of the free chain in the descriptor table. If
               * there are no free descriptors, this will be set to
               * VQ_RING_DESC_CHAIN_END.
               */
              uint16_t                 vq_desc_head_idx;
              /*
               * Last consumed descriptor in the used table,
               * trails vq_ring.used->idx.
               */
              uint16_t                 vq_used_cons_idx;
      
              struct vq_desc_extra {
                      void                  *cookie;
                      struct vring_desc *indirect;
                      vm_paddr_t           indirect_paddr;
                      uint16_t           ndescs;
              } vq_descx[0];
      };
      
      /*
       * The maximum virtqueue size is 2^15. Use that value as the end of
       * descriptor chain terminator since it will never be a valid index
       * in the descriptor table. This is used to verify we are correctly
       * handling vq_free_cnt.
       */
      #define VQ_RING_DESC_CHAIN_END 32768
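
/*
 * Illustrative note (a hedged sketch, not part of the driver contract): since
 * any valid descriptor index is below 32768, code walking the free list can
 * treat VQ_RING_DESC_CHAIN_END as "no descriptors left", e.g.:
 *
 *        if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
 *                ... the ring is fully consumed; nothing left to hand out ...
 */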
      
      #define VQASSERT(_vq, _exp, _msg, ...)                                \
          KASSERT((_exp),("%s: %s - "_msg, __func__, (_vq)->vq_name,        \
              ##__VA_ARGS__))
      
      #define VQ_RING_ASSERT_VALID_IDX(_vq, _idx)                        \
          VQASSERT((_vq), (_idx) < (_vq)->vq_nentries,                \
              "invalid ring index: %d, max: %d", (_idx),                \
              (_vq)->vq_nentries)
      
      #define VQ_RING_ASSERT_CHAIN_TERM(_vq)                                \
          VQASSERT((_vq), (_vq)->vq_desc_head_idx ==                        \
              VQ_RING_DESC_CHAIN_END,        "full ring terminated "                \
              "incorrectly: head idx: %d", (_vq)->vq_desc_head_idx)
      
      static int        virtqueue_init_indirect(struct virtqueue *vq, int);
      static void        virtqueue_free_indirect(struct virtqueue *vq);
      static void        virtqueue_init_indirect_list(struct virtqueue *,
                          struct vring_desc *);
      
      static void        vq_ring_init(struct virtqueue *);
      static void        vq_ring_update_avail(struct virtqueue *, uint16_t);
      static uint16_t        vq_ring_enqueue_segments(struct virtqueue *,
                          struct vring_desc *, uint16_t, struct sglist *, int, int);
      static int        vq_ring_use_indirect(struct virtqueue *, int);
      static void        vq_ring_enqueue_indirect(struct virtqueue *, void *,
                          struct sglist *, int, int);
      static int        vq_ring_enable_interrupt(struct virtqueue *, uint16_t);
      static int        vq_ring_must_notify_host(struct virtqueue *);
      static void        vq_ring_notify_host(struct virtqueue *);
      static void        vq_ring_free_chain(struct virtqueue *, uint16_t);
      
      uint64_t
      virtqueue_filter_features(uint64_t features)
      {
              uint64_t mask;
      
              mask = (1 << VIRTIO_TRANSPORT_F_START) - 1;
              mask |= VIRTIO_RING_F_INDIRECT_DESC;
              mask |= VIRTIO_RING_F_EVENT_IDX;
              mask |= VIRTIO_F_VERSION_1;
      
              return (features & mask);
      }
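
/*
 * Example (minimal sketch): transport drivers typically run the negotiated
 * feature bits through this filter so that only ring-level features the
 * virtqueue code understands survive; "host_features" and "driver_features"
 * below are hypothetical names:
 *
 *        features = virtqueue_filter_features(host_features & driver_features);
 */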
      
      int
      virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size, int align,
          vm_paddr_t highaddr, struct vq_alloc_info *info, struct virtqueue **vqp)
      {
              struct virtqueue *vq;
              int error;
      
              *vqp = NULL;
              error = 0;
      
              if (size == 0) {
                      device_printf(dev,
                          "virtqueue %d (%s) does not exist (size is zero)\n",
                          queue, info->vqai_name);
                      return (ENODEV);
              } else if (!powerof2(size)) {
                      device_printf(dev,
                          "virtqueue %d (%s) size is not a power of 2: %d\n",
                          queue, info->vqai_name, size);
                      return (ENXIO);
              } else if (info->vqai_maxindirsz > VIRTIO_MAX_INDIRECT) {
                      device_printf(dev, "virtqueue %d (%s) requested too many "
                          "indirect descriptors: %d, max %d\n",
                          queue, info->vqai_name, info->vqai_maxindirsz,
                          VIRTIO_MAX_INDIRECT);
                      return (EINVAL);
              }
      
              vq = malloc(sizeof(struct virtqueue) +
                  size * sizeof(struct vq_desc_extra), M_DEVBUF, M_NOWAIT | M_ZERO);
              if (vq == NULL) {
                      device_printf(dev, "cannot allocate virtqueue\n");
                      return (ENOMEM);
              }
      
              vq->vq_dev = dev;
              strlcpy(vq->vq_name, info->vqai_name, sizeof(vq->vq_name));
              vq->vq_queue_index = queue;
              vq->vq_alignment = align;
              vq->vq_nentries = size;
              vq->vq_free_cnt = size;
              vq->vq_intrhand = info->vqai_intr;
              vq->vq_intrhand_arg = info->vqai_intr_arg;
      
              if (VIRTIO_BUS_WITH_FEATURE(dev, VIRTIO_RING_F_EVENT_IDX) != 0)
                      vq->vq_flags |= VIRTQUEUE_FLAG_EVENT_IDX;
      
              if (info->vqai_maxindirsz > 1) {
                      error = virtqueue_init_indirect(vq, info->vqai_maxindirsz);
                      if (error)
                              goto fail;
              }
      
              vq->vq_ring_size = round_page(vring_size(size, align));
              vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF,
                  M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0);
              if (vq->vq_ring_mem == NULL) {
                      device_printf(dev,
                          "cannot allocate memory for virtqueue ring\n");
                      error = ENOMEM;
                      goto fail;
              }
      
              vq_ring_init(vq);
              virtqueue_disable_intr(vq);
      
              *vqp = vq;
      
      fail:
              if (error)
                      virtqueue_free(vq);
      
              return (error);
      }
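
/*
 * Usage sketch (hypothetical, using only the parameter names from the
 * signature above): a transport fills a struct vq_alloc_info with the queue
 * name, maximum indirect segment count, and interrupt handler, then lets
 * virtqueue_alloc() size and wire up the ring:
 *
 *        struct vq_alloc_info info;
 *        struct virtqueue *vq;
 *        int error;
 *
 *        ... fill in info.vqai_* fields ...
 *        error = virtqueue_alloc(dev, queue, size, align, highaddr,
 *            &info, &vq);
 */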
      
      static int
      virtqueue_init_indirect(struct virtqueue *vq, int indirect_size)
      {
              device_t dev;
              struct vq_desc_extra *dxp;
              int i, size;
      
              dev = vq->vq_dev;
      
              if (VIRTIO_BUS_WITH_FEATURE(dev, VIRTIO_RING_F_INDIRECT_DESC) == 0) {
                      /*
                       * Indirect descriptors requested by the driver but not
                       * negotiated. Return zero to keep the initialization
                       * going: we'll run fine without.
                       */
                      if (bootverbose)
                              device_printf(dev, "virtqueue %d (%s) requested "
                                  "indirect descriptors but not negotiated\n",
                                  vq->vq_queue_index, vq->vq_name);
                      return (0);
              }
      
              size = indirect_size * sizeof(struct vring_desc);
              vq->vq_max_indirect_size = indirect_size;
              vq->vq_indirect_mem_size = size;
              vq->vq_flags |= VIRTQUEUE_FLAG_INDIRECT;
      
              for (i = 0; i < vq->vq_nentries; i++) {
                      dxp = &vq->vq_descx[i];
      
                      dxp->indirect = malloc(size, M_DEVBUF, M_NOWAIT);
                      if (dxp->indirect == NULL) {
                              device_printf(dev, "cannot allocate indirect list\n");
                              return (ENOMEM);
                      }
      
                      dxp->indirect_paddr = vtophys(dxp->indirect);
                      virtqueue_init_indirect_list(vq, dxp->indirect);
              }
      
              return (0);
      }
      
      static void
      virtqueue_free_indirect(struct virtqueue *vq)
      {
              struct vq_desc_extra *dxp;
              int i;
      
              for (i = 0; i < vq->vq_nentries; i++) {
                      dxp = &vq->vq_descx[i];
      
                      if (dxp->indirect == NULL)
                              break;
      
                      free(dxp->indirect, M_DEVBUF);
                      dxp->indirect = NULL;
                      dxp->indirect_paddr = 0;
              }
      
              vq->vq_flags &= ~VIRTQUEUE_FLAG_INDIRECT;
              vq->vq_indirect_mem_size = 0;
      }
      
      static void
      virtqueue_init_indirect_list(struct virtqueue *vq,
          struct vring_desc *indirect)
      {
              int i;
      
              bzero(indirect, vq->vq_indirect_mem_size);
      
              for (i = 0; i < vq->vq_max_indirect_size - 1; i++)
                      indirect[i].next = i + 1;
              indirect[i].next = VQ_RING_DESC_CHAIN_END;
      }
      
      int
      virtqueue_reinit(struct virtqueue *vq, uint16_t size)
      {
              struct vq_desc_extra *dxp;
              int i;
      
              if (vq->vq_nentries != size) {
                      device_printf(vq->vq_dev,
                          "%s: '%s' changed size; old=%hu, new=%hu\n",
                          __func__, vq->vq_name, vq->vq_nentries, size);
                      return (EINVAL);
              }
      
              /* Warn if the virtqueue was not properly cleaned up. */
              if (vq->vq_free_cnt != vq->vq_nentries) {
                      device_printf(vq->vq_dev,
                          "%s: warning '%s' virtqueue not empty, "
                          "leaking %d entries\n", __func__, vq->vq_name,
                          vq->vq_nentries - vq->vq_free_cnt);
              }
      
              vq->vq_desc_head_idx = 0;
              vq->vq_used_cons_idx = 0;
              vq->vq_queued_cnt = 0;
              vq->vq_free_cnt = vq->vq_nentries;
      
              /* To be safe, reset all our allocated memory. */
              bzero(vq->vq_ring_mem, vq->vq_ring_size);
              for (i = 0; i < vq->vq_nentries; i++) {
                      dxp = &vq->vq_descx[i];
                      dxp->cookie = NULL;
                      dxp->ndescs = 0;
                      if (vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT)
                              virtqueue_init_indirect_list(vq, dxp->indirect);
              }
      
              vq_ring_init(vq);
              virtqueue_disable_intr(vq);
      
              return (0);
      }
      
      void
      virtqueue_free(struct virtqueue *vq)
      {
      
              if (vq->vq_free_cnt != vq->vq_nentries) {
                      device_printf(vq->vq_dev, "%s: freeing non-empty virtqueue, "
                          "leaking %d entries\n", vq->vq_name,
                          vq->vq_nentries - vq->vq_free_cnt);
              }
      
              if (vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT)
                      virtqueue_free_indirect(vq);
      
              if (vq->vq_ring_mem != NULL) {
                      contigfree(vq->vq_ring_mem, vq->vq_ring_size, M_DEVBUF);
                      vq->vq_ring_size = 0;
                      vq->vq_ring_mem = NULL;
              }
      
              free(vq, M_DEVBUF);
      }
      
      vm_paddr_t
      virtqueue_paddr(struct virtqueue *vq)
      {
      
              return (vtophys(vq->vq_ring_mem));
      }
      
      vm_paddr_t
      virtqueue_desc_paddr(struct virtqueue *vq)
      {
      
              return (vtophys(vq->vq_ring.desc));
      }
      
      vm_paddr_t
      virtqueue_avail_paddr(struct virtqueue *vq)
      {
      
              return (vtophys(vq->vq_ring.avail));
      }
      
      vm_paddr_t
      virtqueue_used_paddr(struct virtqueue *vq)
      {
      
              return (vtophys(vq->vq_ring.used));
      }
      
      uint16_t
      virtqueue_index(struct virtqueue *vq)
      {
              return (vq->vq_queue_index);
      }
      
      int
      virtqueue_size(struct virtqueue *vq)
      {
      
              return (vq->vq_nentries);
      }
      
      int
      virtqueue_nfree(struct virtqueue *vq)
{
      
              return (vq->vq_free_cnt);
      }
      
      int
      virtqueue_empty(struct virtqueue *vq)
{
      
              return (vq->vq_nentries == vq->vq_free_cnt);
      }
      
      int
      virtqueue_full(struct virtqueue *vq)
{
      
              return (vq->vq_free_cnt == 0);
      }
      
      void
      virtqueue_notify(struct virtqueue *vq)
{
      
              /* Ensure updated avail->idx is visible to host. */
              mb();
      
        if (vq_ring_must_notify_host(vq))
                      vq_ring_notify_host(vq);
              vq->vq_queued_cnt = 0;
      }
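
/*
 * Example (minimal sketch; all names are hypothetical): the usual submission
 * pattern batches enqueues and issues one notify, relying on
 * vq_ring_must_notify_host() to suppress the doorbell when the host asked not
 * to be notified:
 *
 *        while (driver_has_work(sc))
 *                (void)virtqueue_enqueue(vq, cookie, sg, nread, nwrite);
 *        virtqueue_notify(vq);
 */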
      
      int
      virtqueue_nused(struct virtqueue *vq)
      {
              uint16_t used_idx, nused;
      
              used_idx = vq->vq_ring.used->idx;
      
              nused = (uint16_t)(used_idx - vq->vq_used_cons_idx);
              VQASSERT(vq, nused <= vq->vq_nentries, "used more than available");
      
              return (nused);
      }
      
      int
      virtqueue_intr_filter(struct virtqueue *vq)
      {
      
              if (vq->vq_used_cons_idx == vq->vq_ring.used->idx)
                      return (0);
      
              virtqueue_disable_intr(vq);
      
              return (1);
      }
      
      void
      virtqueue_intr(struct virtqueue *vq)
      {
      
              vq->vq_intrhand(vq->vq_intrhand_arg);
      }
      
      int
      virtqueue_enable_intr(struct virtqueue *vq)
      {
      
              return (vq_ring_enable_interrupt(vq, 0));
      }
      
      int
      virtqueue_postpone_intr(struct virtqueue *vq, vq_postpone_t hint)
      {
              uint16_t ndesc, avail_idx;
      
              avail_idx = vq->vq_ring.avail->idx;
              ndesc = (uint16_t)(avail_idx - vq->vq_used_cons_idx);
      
              switch (hint) {
              case VQ_POSTPONE_SHORT:
                      ndesc = ndesc / 4;
                      break;
              case VQ_POSTPONE_LONG:
                      ndesc = (ndesc * 3) / 4;
                      break;
              case VQ_POSTPONE_EMPTIED:
                      break;
              }
      
              return (vq_ring_enable_interrupt(vq, ndesc));
      }
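
/*
 * Worked example: with avail->idx - vq_used_cons_idx == 64 descriptors still
 * outstanding, VQ_POSTPONE_SHORT requests an interrupt after roughly 16 more
 * completions, VQ_POSTPONE_LONG after roughly 48, and VQ_POSTPONE_EMPTIED
 * only once all 64 have been consumed.  The hint only moderates interrupts
 * when VIRTIO_RING_F_EVENT_IDX was negotiated; otherwise
 * vq_ring_enable_interrupt() just clears VRING_AVAIL_F_NO_INTERRUPT.
 */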
      
      /*
       * Note this is only considered a hint to the host.
       */
      void
      virtqueue_disable_intr(struct virtqueue *vq)
      {
      
              if (vq->vq_flags & VIRTQUEUE_FLAG_EVENT_IDX) {
                      vring_used_event(&vq->vq_ring) = vq->vq_used_cons_idx -
                          vq->vq_nentries - 1;
              } else
                      vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
      }
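
/*
 * Descriptive note: with EVENT_IDX the used-event index is parked well away
 * from anything the host is about to report, so in practice no used-buffer
 * interrupt is raised; with the flag-based scheme the suppression is equally
 * advisory.  Either way, spurious interrupts remain possible and callers are
 * expected to tolerate them.
 */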
      
      int
      virtqueue_enqueue(struct virtqueue *vq, void *cookie, struct sglist *sg,
          int readable, int writable)
{
              struct vq_desc_extra *dxp;
              int needed;
              uint16_t head_idx, idx;
      
              needed = readable + writable;
      
              VQASSERT(vq, cookie != NULL, "enqueuing with no cookie");
              VQASSERT(vq, needed == sg->sg_nseg,
                  "segment count mismatch, %d, %d", needed, sg->sg_nseg);
        VQASSERT(vq,
                  needed <= vq->vq_nentries || needed <= vq->vq_max_indirect_size,
                  "too many segments to enqueue: %d, %d/%d", needed,
                  vq->vq_nentries, vq->vq_max_indirect_size);
      
              if (needed < 1)
                      return (EINVAL);
              if (vq->vq_free_cnt == 0)
                      return (ENOSPC);
      
              if (vq_ring_use_indirect(vq, needed)) {
                      vq_ring_enqueue_indirect(vq, cookie, sg, readable, writable);
                      return (0);
              } else if (vq->vq_free_cnt < needed)
                      return (EMSGSIZE);
      
              head_idx = vq->vq_desc_head_idx;
              VQ_RING_ASSERT_VALID_IDX(vq, head_idx);
              dxp = &vq->vq_descx[head_idx];
      
              VQASSERT(vq, dxp->cookie == NULL,
                  "cookie already exists for index %d", head_idx);
              dxp->cookie = cookie;
              dxp->ndescs = needed;
      
              idx = vq_ring_enqueue_segments(vq, vq->vq_ring.desc, head_idx,
                  sg, readable, writable);
      
              vq->vq_desc_head_idx = idx;
              vq->vq_free_cnt -= needed;
              if (vq->vq_free_cnt == 0)
                      VQ_RING_ASSERT_CHAIN_TERM(vq);
              else
                VQ_RING_ASSERT_VALID_IDX(vq, idx);
      
              vq_ring_update_avail(vq, head_idx);
      
              return (0);
      }
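
/*
 * Usage sketch (hypothetical names; assumes the sglist(9) API): a request is
 * described as a scatter/gather list whose first "readable" segments the
 * device reads and whose remaining "writable" segments the device writes,
 * with an opaque cookie used to match the completion later:
 *
 *        struct sglist_seg segs[2];
 *        struct sglist sg;
 *
 *        sglist_init(&sg, 2, segs);
 *        sglist_append(&sg, hdr, sizeof(*hdr));
 *        sglist_append(&sg, buf, buflen);
 *        error = virtqueue_enqueue(vq, req, &sg, 1, 1);
 */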
      
      void *
      virtqueue_dequeue(struct virtqueue *vq, uint32_t *len)
{
              struct vring_used_elem *uep;
              void *cookie;
              uint16_t used_idx, desc_idx;
      
        if (vq->vq_used_cons_idx == vq->vq_ring.used->idx)
                      return (NULL);
      
              used_idx = vq->vq_used_cons_idx++ & (vq->vq_nentries - 1);
              uep = &vq->vq_ring.used->ring[used_idx];
      
              rmb();
              desc_idx = (uint16_t) uep->id;
        if (len != NULL)
                      *len = uep->len;
      
              vq_ring_free_chain(vq, desc_idx);
      
              cookie = vq->vq_descx[desc_idx].cookie;
              VQASSERT(vq, cookie != NULL, "no cookie for index %d", desc_idx);
        vq->vq_descx[desc_idx].cookie = NULL;
      
              return (cookie);
      }
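
/*
 * Example (minimal sketch; "complete_request" and "sc" are hypothetical): an
 * interrupt or taskqueue handler normally drains completions in a loop, using
 * the cookie to locate the original request and *len for the number of bytes
 * the device wrote:
 *
 *        uint32_t len;
 *        void *cookie;
 *
 *        while ((cookie = virtqueue_dequeue(vq, &len)) != NULL)
 *                complete_request(sc, cookie, len);
 */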
      
      void *
      virtqueue_poll(struct virtqueue *vq, uint32_t *len)
      {
              void *cookie;
      
              VIRTIO_BUS_POLL(vq->vq_dev);
              while ((cookie = virtqueue_dequeue(vq, len)) == NULL) {
                      cpu_spinwait();
                      VIRTIO_BUS_POLL(vq->vq_dev);
              }
      
              return (cookie);
      }
      
      void *
      virtqueue_drain(struct virtqueue *vq, int *last)
      {
              void *cookie;
              int idx;
      
              cookie = NULL;
              idx = *last;
      
              while (idx < vq->vq_nentries && cookie == NULL) {
                      if ((cookie = vq->vq_descx[idx].cookie) != NULL) {
                              vq->vq_descx[idx].cookie = NULL;
                              /* Free chain to keep free count consistent. */
                              vq_ring_free_chain(vq, idx);
                      }
                      idx++;
              }
      
              *last = idx;
      
              return (cookie);
      }
      
      void
      virtqueue_dump(struct virtqueue *vq)
      {
      
              if (vq == NULL)
                      return;
      
              printf("VQ: %s - size=%d; free=%d; used=%d; queued=%d; "
                  "desc_head_idx=%d; avail.idx=%d; used_cons_idx=%d; "
                  "used.idx=%d; used_event_idx=%d; avail.flags=0x%x; used.flags=0x%x\n",
                  vq->vq_name, vq->vq_nentries, vq->vq_free_cnt,
                  virtqueue_nused(vq), vq->vq_queued_cnt, vq->vq_desc_head_idx,
                  vq->vq_ring.avail->idx, vq->vq_used_cons_idx,
            vq->vq_ring.used->idx, vring_used_event(&vq->vq_ring),
            vq->vq_ring.avail->flags, vq->vq_ring.used->flags);
      }
      
      static void
      vq_ring_init(struct virtqueue *vq)
      {
              struct vring *vr;
              char *ring_mem;
              int i, size;
      
              ring_mem = vq->vq_ring_mem;
              size = vq->vq_nentries;
              vr = &vq->vq_ring;
      
              vring_init(vr, size, ring_mem, vq->vq_alignment);
      
              for (i = 0; i < size - 1; i++)
                      vr->desc[i].next = i + 1;
              vr->desc[i].next = VQ_RING_DESC_CHAIN_END;
      }
      
      static void
      vq_ring_update_avail(struct virtqueue *vq, uint16_t desc_idx)
      {
              uint16_t avail_idx;
      
              /*
               * Place the head of the descriptor chain into the next slot and make
               * it usable to the host. The chain is made available now rather than
               * deferring to virtqueue_notify() in the hopes that if the host is
               * currently running on another CPU, we can keep it processing the new
               * descriptor.
               */
              avail_idx = vq->vq_ring.avail->idx & (vq->vq_nentries - 1);
              vq->vq_ring.avail->ring[avail_idx] = desc_idx;
      
              wmb();
              vq->vq_ring.avail->idx++;
      
              /* Keep pending count until virtqueue_notify(). */
              vq->vq_queued_cnt++;
      }
      
      static uint16_t
      vq_ring_enqueue_segments(struct virtqueue *vq, struct vring_desc *desc,
          uint16_t head_idx, struct sglist *sg, int readable, int writable)
      {
              struct sglist_seg *seg;
              struct vring_desc *dp;
              int i, needed;
              uint16_t idx;
      
              needed = readable + writable;
      
        for (i = 0, idx = head_idx, seg = sg->sg_segs;
                   i < needed;
                   i++, idx = dp->next, seg++) {
                      VQASSERT(vq, idx != VQ_RING_DESC_CHAIN_END,
                          "premature end of free desc chain");
      
                      dp = &desc[idx];
                      dp->addr = seg->ss_paddr;
                      dp->len = seg->ss_len;
                      dp->flags = 0;
      
                      if (i < needed - 1)
                              dp->flags |= VRING_DESC_F_NEXT;
                      if (i >= readable)
                              dp->flags |= VRING_DESC_F_WRITE;
              }
      
              return (idx);
      }
      
      static int
      vq_ring_use_indirect(struct virtqueue *vq, int needed)
      {
      
        if ((vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT) == 0)
                      return (0);
      
              if (vq->vq_max_indirect_size < needed)
                      return (0);
      
              if (needed < 2)
                      return (0);
      
              return (1);
      }
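
/*
 * Example: with VIRTIO_RING_F_INDIRECT_DESC negotiated and a
 * vq_max_indirect_size of 32, a 5-segment request consumes a single ring
 * descriptor that points at a 5-entry indirect table, whereas a 1-segment
 * request still uses a regular descriptor since indirection would save
 * nothing.
 */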
      
      static void
      vq_ring_enqueue_indirect(struct virtqueue *vq, void *cookie,
          struct sglist *sg, int readable, int writable)
      {
              struct vring_desc *dp;
              struct vq_desc_extra *dxp;
              int needed;
              uint16_t head_idx;
      
              needed = readable + writable;
              VQASSERT(vq, needed <= vq->vq_max_indirect_size,
                  "enqueuing too many indirect descriptors");
      
              head_idx = vq->vq_desc_head_idx;
              VQ_RING_ASSERT_VALID_IDX(vq, head_idx);
              dp = &vq->vq_ring.desc[head_idx];
              dxp = &vq->vq_descx[head_idx];
      
              VQASSERT(vq, dxp->cookie == NULL,
                  "cookie already exists for index %d", head_idx);
              dxp->cookie = cookie;
              dxp->ndescs = 1;
      
              dp->addr = dxp->indirect_paddr;
              dp->len = needed * sizeof(struct vring_desc);
              dp->flags = VRING_DESC_F_INDIRECT;
      
              vq_ring_enqueue_segments(vq, dxp->indirect, 0,
                  sg, readable, writable);
      
              vq->vq_desc_head_idx = dp->next;
              vq->vq_free_cnt--;
              if (vq->vq_free_cnt == 0)
                      VQ_RING_ASSERT_CHAIN_TERM(vq);
              else
                      VQ_RING_ASSERT_VALID_IDX(vq, vq->vq_desc_head_idx);
      
              vq_ring_update_avail(vq, head_idx);
      }
      
      static int
      vq_ring_enable_interrupt(struct virtqueue *vq, uint16_t ndesc)
      {
      
              /*
               * Enable interrupts, making sure we get the latest index of
               * what's already been consumed.
               */
              if (vq->vq_flags & VIRTQUEUE_FLAG_EVENT_IDX)
                      vring_used_event(&vq->vq_ring) = vq->vq_used_cons_idx + ndesc;
              else
                      vq->vq_ring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
      
              mb();
      
              /*
               * Enough items may have already been consumed to meet our threshold
               * since we last checked. Let our caller know so it processes the new
               * entries.
               */
              if (virtqueue_nused(vq) > ndesc)
                      return (1);
      
              return (0);
      }
      
      static int
      vq_ring_must_notify_host(struct virtqueue *vq)
      {
              uint16_t new_idx, prev_idx, event_idx;
      
              if (vq->vq_flags & VIRTQUEUE_FLAG_EVENT_IDX) {
                new_idx = vq->vq_ring.avail->idx;
                      prev_idx = new_idx - vq->vq_queued_cnt;
                      event_idx = vring_avail_event(&vq->vq_ring);
      
                      return (vring_need_event(event_idx, new_idx, prev_idx) != 0);
              }
      
        return ((vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY) == 0);
      }
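
/*
 * Worked example for the EVENT_IDX branch, using the standard
 * vring_need_event() arithmetic: if the guest just advanced avail->idx from 4
 * to 6 (prev_idx = 4, new_idx = 6) and the host published an avail event
 * index of 5, then (uint16_t)(6 - 5 - 1) == 0 is less than
 * (uint16_t)(6 - 4) == 2 and the host is notified; had the event index been
 * 7, the comparison fails and the doorbell is skipped.
 */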
      
      static void
      vq_ring_notify_host(struct virtqueue *vq)
      {
      
              VIRTIO_BUS_NOTIFY_VQ(vq->vq_dev, vq->vq_queue_index);
      }
      
      static void
      vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
{
              struct vring_desc *dp;
              struct vq_desc_extra *dxp;
      
              VQ_RING_ASSERT_VALID_IDX(vq, desc_idx);
              dp = &vq->vq_ring.desc[desc_idx];
              dxp = &vq->vq_descx[desc_idx];
      
        if (vq->vq_free_cnt == 0)
                      VQ_RING_ASSERT_CHAIN_TERM(vq);
      
              vq->vq_free_cnt += dxp->ndescs;
              dxp->ndescs--;
      
              if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) {
                while (dp->flags & VRING_DESC_F_NEXT) {
                              VQ_RING_ASSERT_VALID_IDX(vq, dp->next);
                        dp = &vq->vq_ring.desc[dp->next];
                              dxp->ndescs--;
                      }
              }
      
              VQASSERT(vq, dxp->ndescs == 0,
                  "failed to free entire desc chain, remaining: %d", dxp->ndescs);
      
              /*
         * We must append the existing free chain, if any, to the end of the
         * newly freed chain. If the virtqueue was completely used, then
               * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above).
               */
              dp->next = vq->vq_desc_head_idx;
              vq->vq_desc_head_idx = desc_idx;
      }
      /*-
       * SPDX-License-Identifier: BSD-3-Clause
       *
       * Copyright (c) 1985, 1986, 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)in_var.h        8.2 (Berkeley) 1/9/95
       * $FreeBSD$
       */
      
      #ifndef _NETINET_IN_VAR_H_
      #define _NETINET_IN_VAR_H_
      
      /*
       * Argument structure for SIOCAIFADDR.
       */
      struct        in_aliasreq {
              char        ifra_name[IFNAMSIZ];                /* if name, e.g. "en0" */
              struct        sockaddr_in ifra_addr;
              struct        sockaddr_in ifra_broadaddr;
      #define ifra_dstaddr ifra_broadaddr
              struct        sockaddr_in ifra_mask;
              int        ifra_vhid;
      };
      
      #ifdef _KERNEL
      #include <sys/queue.h>
      #include <sys/fnv_hash.h>
      #include <sys/tree.h>
      
      struct igmp_ifsoftc;
      struct in_multi;
      struct lltable;
      SLIST_HEAD(in_multi_head, in_multi);
      
      /*
       * IPv4 per-interface state.
       */
      struct in_ifinfo {
              struct lltable                *ii_llt;        /* ARP state */
              struct igmp_ifsoftc        *ii_igmp;        /* IGMP state */
              struct in_multi                *ii_allhosts;        /* 224.0.0.1 membership */
      };
      
      /*
       * Interface address, Internet version.  One of these structures
       * is allocated for each Internet address on an interface.
       * The ifaddr structure contains the protocol-independent part
       * of the structure and is assumed to be first.
       */
      struct in_ifaddr {
              struct        ifaddr ia_ifa;                /* protocol-independent info */
      #define        ia_ifp                ia_ifa.ifa_ifp
      #define ia_flags        ia_ifa.ifa_flags
                                              /* ia_subnet{,mask} in host order */
              u_long        ia_subnet;                /* subnet address */
              u_long        ia_subnetmask;                /* mask of subnet */
              LIST_ENTRY(in_ifaddr) ia_hash;        /* entry in bucket of inet addresses */
              CK_STAILQ_ENTRY(in_ifaddr) ia_link;        /* list of internet addresses */
        struct        sockaddr_in ia_addr;        /* reserve space for interface address */
              struct        sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */
      #define        ia_broadaddr        ia_dstaddr
              struct        sockaddr_in ia_sockmask; /* reserve space for general netmask */
              struct        callout ia_garp_timer;        /* timer for retransmitting GARPs */
              int        ia_garp_count;                /* count of retransmitted GARPs */
      };
      
      /*
       * Given a pointer to an in_ifaddr (ifaddr),
       * return a pointer to the addr as a sockaddr_in.
       */
      #define IA_SIN(ia)    (&(((struct in_ifaddr *)(ia))->ia_addr))
      #define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr))
      #define IA_MASKSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_sockmask))
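
/*
 * Example (minimal sketch): given a struct in_ifaddr pointer "ia", the
 * interface's IPv4 address in network byte order is simply
 *
 *        struct in_addr addr = IA_SIN(ia)->sin_addr;
 */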
      
#define IN_LNAOF(in, ifa) \
        ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa))->ia_subnetmask))
      
      extern        u_char        inetctlerrmap[];
      
      #define LLTABLE(ifp)        \
              ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt
      /*
       * Hash table for IP addresses.
       */
      CK_STAILQ_HEAD(in_ifaddrhead, in_ifaddr);
      LIST_HEAD(in_ifaddrhashhead, in_ifaddr);
      
      VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl);
      VNET_DECLARE(struct in_ifaddrhead, in_ifaddrhead);
      VNET_DECLARE(u_long, in_ifaddrhmask);                /* mask for hash table */
      
      #define        V_in_ifaddrhashtbl        VNET(in_ifaddrhashtbl)
      #define        V_in_ifaddrhead                VNET(in_ifaddrhead)
      #define        V_in_ifaddrhmask        VNET(in_ifaddrhmask)
      
      #define INADDR_NHASH_LOG2       9
      #define INADDR_NHASH                (1 << INADDR_NHASH_LOG2)
      #define INADDR_HASHVAL(x)        fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT)
      #define INADDR_HASH(x) \
              (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask])
      
      extern        struct rmlock in_ifaddr_lock;
      
      #define        IN_IFADDR_LOCK_ASSERT()        rm_assert(&in_ifaddr_lock, RA_LOCKED)
      #define        IN_IFADDR_RLOCK(t)        rm_rlock(&in_ifaddr_lock, (t))
      #define        IN_IFADDR_RLOCK_ASSERT()        rm_assert(&in_ifaddr_lock, RA_RLOCKED)
      #define        IN_IFADDR_RUNLOCK(t)        rm_runlock(&in_ifaddr_lock, (t))
      #define        IN_IFADDR_WLOCK()        rm_wlock(&in_ifaddr_lock)
      #define        IN_IFADDR_WLOCK_ASSERT()        rm_assert(&in_ifaddr_lock, RA_WLOCKED)
      #define        IN_IFADDR_WUNLOCK()        rm_wunlock(&in_ifaddr_lock)
      
      /*
       * Macro for finding the internet address structure (in_ifaddr)
       * corresponding to one of our IP addresses (in_addr).
       */
      #define INADDR_TO_IFADDR(addr, ia) \
              /* struct in_addr addr; */ \
              /* struct in_ifaddr *ia; */ \
      do { \
      \
              LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \
                      if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \
                              break; \
      } while (0)
      
      /*
       * Macro for finding the interface (ifnet structure) corresponding to one
       * of our IP addresses.
       */
      #define INADDR_TO_IFP(addr, ifp) \
              /* struct in_addr addr; */ \
              /* struct ifnet *ifp; */ \
      { \
              struct in_ifaddr *ia; \
      \
              INADDR_TO_IFADDR(addr, ia); \
              (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \
      }
      
      /*
       * Macro for finding the internet address structure (in_ifaddr) corresponding
       * to a given interface (ifnet structure).
       */
      #define IFP_TO_IA(ifp, ia, t)                                                \
              /* struct ifnet *ifp; */                                        \
              /* struct in_ifaddr *ia; */                                        \
              /* struct rm_priotracker *t; */                                        \
      do {                                                                        \
              NET_EPOCH_ASSERT();                                                \
              IN_IFADDR_RLOCK((t));                                                \
              for ((ia) = CK_STAILQ_FIRST(&V_in_ifaddrhead);                        \
                  (ia) != NULL && (ia)->ia_ifp != (ifp);                        \
                  (ia) = CK_STAILQ_NEXT((ia), ia_link))                                \
                      continue;                                                \
              IN_IFADDR_RUNLOCK((t));                                                \
      } while (0)
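
/*
 * Usage sketch (hypothetical; the caller must be inside a network epoch
 * section, as asserted above):
 *
 *        struct rm_priotracker tracker;
 *        struct in_ifaddr *ia;
 *
 *        IFP_TO_IA(ifp, ia, &tracker);
 *        if (ia != NULL)
 *                addr = IA_SIN(ia)->sin_addr;
 */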
      
      /*
       * Legacy IPv4 IGMP per-link structure.
       */
      struct router_info {
              struct ifnet *rti_ifp;
              int    rti_type; /* type of router which is querier on this interface */
              int    rti_time; /* # of slow timeouts since last old query */
              SLIST_ENTRY(router_info) rti_list;
      };
      
      /*
       * IPv4 multicast IGMP-layer source entry.
       */
      struct ip_msource {
              RB_ENTRY(ip_msource)        ims_link;        /* RB tree links */
              in_addr_t                ims_haddr;        /* host byte order */
              struct ims_st {
                      uint16_t        ex;                /* # of exclusive members */
                      uint16_t        in;                /* # of inclusive members */
              }                        ims_st[2];        /* state at t0, t1 */
              uint8_t                        ims_stp;        /* pending query */
      };
      
      /*
       * IPv4 multicast PCB-layer source entry.
       */
      struct in_msource {
              RB_ENTRY(ip_msource)        ims_link;        /* RB tree links */
              in_addr_t                ims_haddr;        /* host byte order */
              uint8_t                        imsl_st[2];        /* state before/at commit */
      };
      
      RB_HEAD(ip_msource_tree, ip_msource);        /* define struct ip_msource_tree */
      
      static __inline int
      ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b)
      {
      
              if (a->ims_haddr < b->ims_haddr)
                      return (-1);
              if (a->ims_haddr == b->ims_haddr)
                      return (0);
              return (1);
      }
      RB_PROTOTYPE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp);
      
      /*
       * IPv4 multicast PCB-layer group filter descriptor.
       */
      struct in_mfilter {
              struct ip_msource_tree        imf_sources; /* source list for (S,G) */
              u_long                        imf_nsrc;    /* # of source entries */
              uint8_t                        imf_st[2];   /* state before/at commit */
              struct in_multi               *imf_inm;     /* associated multicast address */
              STAILQ_ENTRY(in_mfilter) imf_entry;  /* list entry */
      };
      
      /*
       * Helper types and functions for IPv4 multicast filters.
       */
      STAILQ_HEAD(ip_mfilter_head, in_mfilter);
      
      struct in_mfilter *ip_mfilter_alloc(int mflags, int st0, int st1);
      void ip_mfilter_free(struct in_mfilter *);
      
      static inline void
      ip_mfilter_init(struct ip_mfilter_head *head)
      {
      
              STAILQ_INIT(head);
      }
      
      static inline struct in_mfilter *
      ip_mfilter_first(const struct ip_mfilter_head *head)
      {
      
              return (STAILQ_FIRST(head));
      }
      
      static inline void
      ip_mfilter_insert(struct ip_mfilter_head *head, struct in_mfilter *imf)
      {
      
              STAILQ_INSERT_TAIL(head, imf, imf_entry);
      }
      
      static inline void
      ip_mfilter_remove(struct ip_mfilter_head *head, struct in_mfilter *imf)
      {
      
        STAILQ_REMOVE(head, imf, in_mfilter, imf_entry);
      }
      
      #define        IP_MFILTER_FOREACH(imf, head) \
              STAILQ_FOREACH(imf, head, imf_entry)
      
      static inline size_t
      ip_mfilter_count(struct ip_mfilter_head *head)
      {
              struct in_mfilter *imf;
              size_t num = 0;
      
        STAILQ_FOREACH(imf, head, imf_entry)
                      num++;
              return (num);
      }
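
/*
 * Example (minimal sketch; "inm" is hypothetical): a filter list is
 * initialized once and then walked with the helpers above:
 *
 *        struct ip_mfilter_head head;
 *        struct in_mfilter *imf;
 *
 *        ip_mfilter_init(&head);
 *        ...
 *        IP_MFILTER_FOREACH(imf, &head)
 *                if (imf->imf_inm == inm)
 *                        break;
 */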
      
      /*
       * IPv4 group descriptor.
       *
       * For every entry on an ifnet's if_multiaddrs list which represents
       * an IP multicast group, there is one of these structures.
       *
       * If any source filters are present, then a node will exist in the RB-tree
       * to permit fast lookup by source whenever an operation takes place.
       * This permits pre-order traversal when we issue reports.
       * Source filter trees are kept separately from the socket layer to
       * greatly simplify locking.
       *
 * When IGMPv3 is active, inm_timer is the group query response timer.
       * The state-change timer inm_sctimer is separate; whenever state changes
       * for the group the state change record is generated and transmitted,
       * and kept if retransmissions are necessary.
       *
       * FUTURE: inm_link is now only used when groups are being purged
       * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but
       * because it is at the very start of the struct, we can't do this
       * w/o breaking the ABI for ifmcstat.
       */
      struct in_multi {
              LIST_ENTRY(in_multi) inm_link;        /* to-be-released by in_ifdetach */
              struct        in_addr inm_addr;        /* IP multicast address, convenience */
              struct        ifnet *inm_ifp;                /* back pointer to ifnet */
              struct        ifmultiaddr *inm_ifma;        /* back pointer to ifmultiaddr */
              u_int        inm_timer;                /* IGMPv1/v2 group / v3 query timer */
              u_int        inm_state;                /* state of the membership */
              void        *inm_rti;                /* unused, legacy field */
              u_int        inm_refcount;                /* reference count */
      
              /* New fields for IGMPv3 follow. */
              struct igmp_ifsoftc        *inm_igi;        /* IGMP info */
              SLIST_ENTRY(in_multi)         inm_nrele;        /* to-be-released by IGMP */
              struct ip_msource_tree         inm_srcs;        /* tree of sources */
              u_long                         inm_nsrc;        /* # of tree entries */
      
              struct mbufq                 inm_scq;        /* queue of pending
                                                       * state-change packets */
              struct timeval                 inm_lastgsrtv;        /* Time of last G-S-R query */
              uint16_t                 inm_sctimer;        /* state-change timer */
              uint16_t                 inm_scrv;        /* state-change rexmit count */
      
              /*
               * SSM state counters which track state at T0 (the time the last
               * state-change report's RV timer went to zero) and T1
               * (time of pending report, i.e. now).
               * Used for computing IGMPv3 state-change reports. Several refcounts
               * are maintained here to optimize for common use-cases.
               */
              struct inm_st {
                      uint16_t        iss_fmode;        /* IGMP filter mode */
                      uint16_t        iss_asm;        /* # of ASM listeners */
                      uint16_t        iss_ex;                /* # of exclusive members */
                      uint16_t        iss_in;                /* # of inclusive members */
                      uint16_t        iss_rec;        /* # of recorded sources */
              }                        inm_st[2];        /* state at t0, t1 */
      };
      
      /*
       * Helper function to derive the filter mode on a source entry
       * from its internal counters. Predicates are:
       *  A source is only excluded if all listeners exclude it.
       *  A source is only included if no listeners exclude it,
       *  and at least one listener includes it.
       * May be used by ifmcstat(8).
       */
      static __inline uint8_t
      ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims,
          uint8_t t)
      {
      
              t = !!t;
        if (inm->inm_st[t].iss_ex > 0 &&
                  inm->inm_st[t].iss_ex == ims->ims_st[t].ex)
                      return (MCAST_EXCLUDE);
        else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0)
                      return (MCAST_INCLUDE);
              return (MCAST_UNDEFINED);
      }
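
/*
 * Worked example: if at t1 the group has inm_st[1].iss_ex == 2 exclusive
 * listeners and a source shows ims_st[1].ex == 2, every excluder excludes it
 * and MCAST_EXCLUDE is returned; with ims_st[1].ex == 1 only some listeners
 * exclude it, so the result is MCAST_UNDEFINED; with ims_st[1].ex == 0 and
 * ims_st[1].in > 0 the source is MCAST_INCLUDE.
 */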
      
      #ifdef SYSCTL_DECL
      SYSCTL_DECL(_net_inet);
      SYSCTL_DECL(_net_inet_ip);
      SYSCTL_DECL(_net_inet_raw);
      #endif
      
      /*
       * Lock macros for IPv4 layer multicast address lists.  IPv4 lock goes
       * before link layer multicast locks in the lock order.  In most cases,
       * consumers of IN_*_MULTI() macros should acquire the locks before
       * calling them; users of the in_{add,del}multi() functions should not.
       */
      extern struct mtx in_multi_list_mtx;
      extern struct sx in_multi_sx;
      
      #define        IN_MULTI_LIST_LOCK()                mtx_lock(&in_multi_list_mtx)
      #define        IN_MULTI_LIST_UNLOCK()        mtx_unlock(&in_multi_list_mtx)
      #define        IN_MULTI_LIST_LOCK_ASSERT()        mtx_assert(&in_multi_list_mtx, MA_OWNED)
      #define        IN_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_NOTOWNED)
      
      #define        IN_MULTI_LOCK()                sx_xlock(&in_multi_sx)
      #define        IN_MULTI_UNLOCK()        sx_xunlock(&in_multi_sx)
      #define        IN_MULTI_LOCK_ASSERT()        sx_assert(&in_multi_sx, SA_XLOCKED)
      #define        IN_MULTI_UNLOCK_ASSERT() sx_assert(&in_multi_sx, SA_XUNLOCKED)
      
      void inm_disconnect(struct in_multi *inm);
      extern int ifma_restart;
      
      /* Acquire an in_multi record. */
      static __inline void
      inm_acquire_locked(struct in_multi *inm)
      {
      
              IN_MULTI_LIST_LOCK_ASSERT();
              ++inm->inm_refcount;
      }
      
      static __inline void
      inm_acquire(struct in_multi *inm)
      {
              IN_MULTI_LIST_LOCK();
              inm_acquire_locked(inm);
              IN_MULTI_LIST_UNLOCK();
      }
      
      static __inline void
      inm_rele_locked(struct in_multi_head *inmh, struct in_multi *inm)
      {
              MPASS(inm->inm_refcount > 0);
              IN_MULTI_LIST_LOCK_ASSERT();
      
        if (--inm->inm_refcount == 0) {
                      MPASS(inmh != NULL);
                      inm_disconnect(inm);
                      inm->inm_ifma->ifma_protospec = NULL;
                      SLIST_INSERT_HEAD(inmh, inm, inm_nrele);
              }
      }
      
      /*
       * Return values for imo_multi_filter().
       */
      #define MCAST_PASS                0        /* Pass */
      #define MCAST_NOTGMEMBER        1        /* This host not a member of group */
      #define MCAST_NOTSMEMBER        2        /* This host excluded source */
      #define MCAST_MUTED                3        /* [deprecated] */
      
      struct rib_head;
      struct        ip_moptions;
      
      struct in_multi *inm_lookup_locked(struct ifnet *, const struct in_addr);
      struct in_multi *inm_lookup(struct ifnet *, const struct in_addr);
      int        imo_multi_filter(const struct ip_moptions *, const struct ifnet *,
                  const struct sockaddr *, const struct sockaddr *);
      void        inm_commit(struct in_multi *);
      void        inm_clear_recorded(struct in_multi *);
      void        inm_print(const struct in_multi *);
      int        inm_record_source(struct in_multi *inm, const in_addr_t);
      void        inm_release_deferred(struct in_multi *);
      void        inm_release_list_deferred(struct in_multi_head *);
      void        inm_release_wait(void *);
      struct        in_multi *
      in_addmulti(struct in_addr *, struct ifnet *);
      int        in_joingroup(struct ifnet *, const struct in_addr *,
                  /*const*/ struct in_mfilter *, struct in_multi **);
      int        in_joingroup_locked(struct ifnet *, const struct in_addr *,
                  /*const*/ struct in_mfilter *, struct in_multi **);
      int        in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *);
      int        in_leavegroup_locked(struct in_multi *,
                  /*const*/ struct in_mfilter *);
      int        in_control(struct socket *, u_long, caddr_t, struct ifnet *,
                  struct thread *);
      int        in_addprefix(struct in_ifaddr *, int);
      int        in_scrubprefix(struct in_ifaddr *, u_int);
      void        in_ifscrub_all(void);
      void        ip_input(struct mbuf *);
      void        ip_direct_input(struct mbuf *);
      void        in_ifadown(struct ifaddr *ifa, int);
      struct        mbuf        *ip_tryforward(struct mbuf *);
      void        *in_domifattach(struct ifnet *);
      void        in_domifdetach(struct ifnet *, void *);
      struct rib_head *in_inithead(uint32_t fibnum);
      #ifdef VIMAGE
      void        in_detachhead(struct rib_head *rh);
      #endif
      
      #endif /* _KERNEL */
      
      /* INET6 stuff */
      #include <netinet6/in6_var.h>
      
      #endif /* _NETINET_IN_VAR_H_ */
      /*-
       * SPDX-License-Identifier: BSD-2-Clause
       *
       * Copyright (c) 2014-2019 Netflix Inc.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_inet.h"
      #include "opt_inet6.h"
      #include "opt_rss.h"
      
      #include <sys/param.h>
      #include <sys/kernel.h>
      #include <sys/ktls.h>
      #include <sys/lock.h>
      #include <sys/mbuf.h>
      #include <sys/mutex.h>
      #include <sys/rmlock.h>
      #include <sys/proc.h>
      #include <sys/protosw.h>
      #include <sys/refcount.h>
      #include <sys/smp.h>
      #include <sys/socket.h>
      #include <sys/socketvar.h>
      #include <sys/sysctl.h>
      #include <sys/taskqueue.h>
      #include <sys/kthread.h>
      #include <sys/uio.h>
      #include <sys/vmmeter.h>
      #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
      #include <machine/pcb.h>
      #endif
      #include <machine/vmparam.h>
      #include <net/if.h>
      #include <net/if_var.h>
      #ifdef RSS
      #include <net/netisr.h>
      #include <net/rss_config.h>
      #endif
      #include <net/route.h>
      #include <net/route/nhop.h>
      #if defined(INET) || defined(INET6)
      #include <netinet/in.h>
      #include <netinet/in_pcb.h>
      #endif
      #include <netinet/tcp_var.h>
      #ifdef TCP_OFFLOAD
      #include <netinet/tcp_offload.h>
      #endif
      #include <opencrypto/xform.h>
      #include <vm/uma_dbg.h>
      #include <vm/vm.h>
      #include <vm/vm_pageout.h>
      #include <vm/vm_page.h>
      
      struct ktls_wq {
              struct mtx        mtx;
              STAILQ_HEAD(, mbuf) m_head;
              STAILQ_HEAD(, socket) so_head;
              bool                running;
      } __aligned(CACHE_LINE_SIZE);
      
      static struct ktls_wq *ktls_wq;
      static struct proc *ktls_proc;
      LIST_HEAD(, ktls_crypto_backend) ktls_backends;
      static struct rmlock ktls_backends_lock;
      static uma_zone_t ktls_session_zone;
      static uint16_t ktls_cpuid_lookup[MAXCPU];
      
      SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Kernel TLS offload");
      SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Kernel TLS offload stats");
      
      static int ktls_allow_unload;
      SYSCTL_INT(_kern_ipc_tls, OID_AUTO, allow_unload, CTLFLAG_RDTUN,
          &ktls_allow_unload, 0, "Allow software crypto modules to unload");
      
      #ifdef RSS
      static int ktls_bind_threads = 1;
      #else
      static int ktls_bind_threads;
      #endif
      SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
          &ktls_bind_threads, 0,
          "Bind crypto threads to cores or domains at boot");
      
      static u_int ktls_maxlen = 16384;
      SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RWTUN,
          &ktls_maxlen, 0, "Maximum TLS record size");
      
      static int ktls_number_threads;
      SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
          &ktls_number_threads, 0,
          "Number of TLS threads in thread-pool");
      
      static bool ktls_offload_enable;
      SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RW,
          &ktls_offload_enable, 0,
          "Enable support for kernel TLS offload");
      
      static bool ktls_cbc_enable = true;
      SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RW,
          &ktls_cbc_enable, 1,
          "Enable Support of AES-CBC crypto for kernel TLS");
      
      static counter_u64_t ktls_tasks_active;
      SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
          &ktls_tasks_active, "Number of active tasks");
      
      static counter_u64_t ktls_cnt_tx_queued;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD,
          &ktls_cnt_tx_queued,
          "Number of TLS records in queue to tasks for SW encryption");
      
      static counter_u64_t ktls_cnt_rx_queued;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD,
          &ktls_cnt_rx_queued,
          "Number of TLS sockets in queue to tasks for SW decryption");
      
      static counter_u64_t ktls_offload_total;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total,
          CTLFLAG_RD, &ktls_offload_total,
          "Total successful TLS setups (parameters set)");
      
      static counter_u64_t ktls_offload_enable_calls;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls,
          CTLFLAG_RD, &ktls_offload_enable_calls,
          "Total number of TLS enable calls made");
      
      static counter_u64_t ktls_offload_active;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD,
          &ktls_offload_active, "Total Active TLS sessions");
      
      static counter_u64_t ktls_offload_corrupted_records;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD,
          &ktls_offload_corrupted_records, "Total corrupted TLS records received");
      
      static counter_u64_t ktls_offload_failed_crypto;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD,
          &ktls_offload_failed_crypto, "Total TLS crypto failures");
      
      static counter_u64_t ktls_switch_to_ifnet;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD,
          &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet");
      
      static counter_u64_t ktls_switch_to_sw;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD,
          &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW");
      
      static counter_u64_t ktls_switch_failed;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
          &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");
      
      SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
          "Software TLS session stats");
      SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
          "Hardware (ifnet) TLS session stats");
      #ifdef TCP_OFFLOAD
      SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
          "TOE TLS session stats");
      #endif
      
      static counter_u64_t ktls_sw_cbc;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc,
          "Active number of software TLS sessions using AES-CBC");
      
      static counter_u64_t ktls_sw_gcm;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm,
          "Active number of software TLS sessions using AES-GCM");
      
      static counter_u64_t ktls_ifnet_cbc;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD,
          &ktls_ifnet_cbc,
          "Active number of ifnet TLS sessions using AES-CBC");
      
      static counter_u64_t ktls_ifnet_gcm;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD,
          &ktls_ifnet_gcm,
          "Active number of ifnet TLS sessions using AES-GCM");
      
      static counter_u64_t ktls_ifnet_reset;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD,
          &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag");
      
      static counter_u64_t ktls_ifnet_reset_dropped;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD,
          &ktls_ifnet_reset_dropped,
          "TLS sessions dropped after failing to update ifnet send tag");
      
      static counter_u64_t ktls_ifnet_reset_failed;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD,
          &ktls_ifnet_reset_failed,
          "TLS sessions that failed to allocate a new ifnet send tag");
      
static u_int ktls_ifnet_permitted;
      SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN,
          &ktls_ifnet_permitted, 1,
          "Whether to permit hardware (ifnet) TLS sessions");
      
      #ifdef TCP_OFFLOAD
      static counter_u64_t ktls_toe_cbc;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD,
          &ktls_toe_cbc,
          "Active number of TOE TLS sessions using AES-CBC");
      
      static counter_u64_t ktls_toe_gcm;
      SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD,
          &ktls_toe_gcm,
          "Active number of TOE TLS sessions using AES-GCM");
      #endif
      
      static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS");
      
      static void ktls_cleanup(struct ktls_session *tls);
      #if defined(INET) || defined(INET6)
      static void ktls_reset_send_tag(void *context, int pending);
      #endif
      static void ktls_work_thread(void *ctx);
      
      int
      ktls_crypto_backend_register(struct ktls_crypto_backend *be)
      {
              struct ktls_crypto_backend *curr_be, *tmp;
      
              if (be->api_version != KTLS_API_VERSION) {
                      printf("KTLS: API version mismatch (%d vs %d) for %s\n",
                          be->api_version, KTLS_API_VERSION,
                          be->name);
                      return (EINVAL);
              }
      
              rm_wlock(&ktls_backends_lock);
              printf("KTLS: Registering crypto method %s with prio %d\n",
                     be->name, be->prio);
              if (LIST_EMPTY(&ktls_backends)) {
                      LIST_INSERT_HEAD(&ktls_backends, be, next);
              } else {
                      LIST_FOREACH_SAFE(curr_be, &ktls_backends, next, tmp) {
                              if (curr_be->prio < be->prio) {
                                      LIST_INSERT_BEFORE(curr_be, be, next);
                                      break;
                              }
                              if (LIST_NEXT(curr_be, next) == NULL) {
                                      LIST_INSERT_AFTER(curr_be, be, next);
                                      break;
                              }
                      }
              }
              rm_wunlock(&ktls_backends_lock);
              return (0);
      }
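
/*
 * Illustrative sketch (not compiled): how a hypothetical software crypto
 * backend module might register itself using the function above.  The
 * module name "ktls_example", its priority, and example_try() are made
 * up for illustration and assume <sys/module.h>; a real backend (such as
 * ktls_ocf) also validates the session parameters in its try() routine,
 * stores per-session state in tls->cipher, and installs its encryption
 * callbacks and tls->free before returning 0 to claim the session.
 */
#if 0
static int
example_try(struct socket *so, struct ktls_session *tls, int direction)
{

        /*
         * Declining every session keeps this sketch inert; returning an
         * error lets lower-priority backends try the session instead.
         */
        return (EOPNOTSUPP);
}

static struct ktls_crypto_backend example_backend = {
        .api_version = KTLS_API_VERSION,
        .prio = 1,
        .name = "ktls_example",
        .try = example_try,
};

static int
example_modevent(module_t mod, int what, void *arg)
{

        switch (what) {
        case MOD_LOAD:
                return (ktls_crypto_backend_register(&example_backend));
        case MOD_UNLOAD:
                return (ktls_crypto_backend_deregister(&example_backend));
        default:
                return (EOPNOTSUPP);
        }
}

static moduledata_t example_moddata = {
        "ktls_example",
        example_modevent,
        NULL
};
DECLARE_MODULE(ktls_example, example_moddata, SI_SUB_PROTO_END, SI_ORDER_ANY);
MODULE_VERSION(ktls_example, 1);
#endif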
      
      int
      ktls_crypto_backend_deregister(struct ktls_crypto_backend *be)
      {
              struct ktls_crypto_backend *tmp;
      
              /*
               * Don't error if the backend isn't registered.  This permits
               * MOD_UNLOAD handlers to use this function unconditionally.
               */
              rm_wlock(&ktls_backends_lock);
              LIST_FOREACH(tmp, &ktls_backends, next) {
                      if (tmp == be)
                              break;
              }
              if (tmp == NULL) {
                      rm_wunlock(&ktls_backends_lock);
                      return (0);
              }
      
              if (!ktls_allow_unload) {
                      rm_wunlock(&ktls_backends_lock);
                      printf(
                          "KTLS: Deregistering crypto method %s is not supported\n",
                          be->name);
                      return (EBUSY);
              }
      
              if (be->use_count) {
                      rm_wunlock(&ktls_backends_lock);
                      return (EBUSY);
              }
      
              LIST_REMOVE(be, next);
              rm_wunlock(&ktls_backends_lock);
              return (0);
      }
      
      #if defined(INET) || defined(INET6)
      static u_int
      ktls_get_cpu(struct socket *so)
      {
              struct inpcb *inp;
              u_int cpuid;
      
              inp = sotoinpcb(so);
      #ifdef RSS
              cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
              if (cpuid != NETISR_CPUID_NONE)
                      return (cpuid);
      #endif
              /*
               * Just use the flowid to shard connections in a repeatable
               * fashion.  Note that some crypto backends rely on the
               * serialization provided by having the same connection use
               * the same queue.
               */
              cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads];
              return (cpuid);
      }
      #endif
      
      static void
      ktls_init(void *dummy __unused)
      {
              struct thread *td;
              struct pcpu *pc;
              cpuset_t mask;
              int error, i;
      
              ktls_tasks_active = counter_u64_alloc(M_WAITOK);
              ktls_cnt_tx_queued = counter_u64_alloc(M_WAITOK);
              ktls_cnt_rx_queued = counter_u64_alloc(M_WAITOK);
              ktls_offload_total = counter_u64_alloc(M_WAITOK);
              ktls_offload_enable_calls = counter_u64_alloc(M_WAITOK);
              ktls_offload_active = counter_u64_alloc(M_WAITOK);
              ktls_offload_corrupted_records = counter_u64_alloc(M_WAITOK);
              ktls_offload_failed_crypto = counter_u64_alloc(M_WAITOK);
              ktls_switch_to_ifnet = counter_u64_alloc(M_WAITOK);
              ktls_switch_to_sw = counter_u64_alloc(M_WAITOK);
              ktls_switch_failed = counter_u64_alloc(M_WAITOK);
              ktls_sw_cbc = counter_u64_alloc(M_WAITOK);
              ktls_sw_gcm = counter_u64_alloc(M_WAITOK);
              ktls_ifnet_cbc = counter_u64_alloc(M_WAITOK);
              ktls_ifnet_gcm = counter_u64_alloc(M_WAITOK);
              ktls_ifnet_reset = counter_u64_alloc(M_WAITOK);
              ktls_ifnet_reset_dropped = counter_u64_alloc(M_WAITOK);
              ktls_ifnet_reset_failed = counter_u64_alloc(M_WAITOK);
      #ifdef TCP_OFFLOAD
              ktls_toe_cbc = counter_u64_alloc(M_WAITOK);
              ktls_toe_gcm = counter_u64_alloc(M_WAITOK);
      #endif
      
              rm_init(&ktls_backends_lock, "ktls backends");
              LIST_INIT(&ktls_backends);
      
              ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS,
                  M_WAITOK | M_ZERO);
      
              ktls_session_zone = uma_zcreate("ktls_session",
                  sizeof(struct ktls_session),
                  NULL, NULL, NULL, NULL,
                  UMA_ALIGN_CACHE, 0);
      
              /*
               * Initialize the workqueues to run the TLS work.  We create a
               * work queue for each CPU.
               */
              CPU_FOREACH(i) {
                      STAILQ_INIT(&ktls_wq[i].m_head);
                      STAILQ_INIT(&ktls_wq[i].so_head);
                      mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
                      error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
                          &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
                      if (error)
                              panic("Can't add KTLS thread %d error %d", i, error);
      
                      /*
                       * Bind threads to cores.  If ktls_bind_threads is >
                       * 1, then we bind to the NUMA domain.
                       */
                      if (ktls_bind_threads) {
                              if (ktls_bind_threads > 1) {
                                      pc = pcpu_find(i);
                                      CPU_COPY(&cpuset_domain[pc->pc_domain], &mask);
                              } else {
                                      CPU_SETOF(i, &mask);
                              }
                              error = cpuset_setthread(td->td_tid, &mask);
                              if (error)
                                panic(
                                    "Unable to bind KTLS thread for CPU %d error %d",
                                    i, error);
                      }
                      ktls_cpuid_lookup[ktls_number_threads] = i;
                      ktls_number_threads++;
              }
              printf("KTLS: Initialized %d threads\n", ktls_number_threads);
      }
      SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL);
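
/*
 * Usage note (illustrative): KTLS is compiled in but stays inert until
 * the kern.ipc.tls.enable sysctl (ktls_offload_enable above) is set to
 * 1; ktls_enable_tx() and ktls_enable_rx() below return ENOTSUP
 * otherwise.  Worker thread binding is controlled by the
 * kern.ipc.tls.bind_threads loader tunable and the per-record size cap
 * by kern.ipc.tls.maxlen.  A typical setup is simply:
 *
 *      sysctl kern.ipc.tls.enable=1
 */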
      
      #if defined(INET) || defined(INET6)
      static int
      ktls_create_session(struct socket *so, struct tls_enable *en,
          struct ktls_session **tlsp)
      {
              struct ktls_session *tls;
              int error;
      
              /* Only TLS 1.0 - 1.3 are supported. */
              if (en->tls_vmajor != TLS_MAJOR_VER_ONE)
                      return (EINVAL);
              if (en->tls_vminor < TLS_MINOR_VER_ZERO ||
                  en->tls_vminor > TLS_MINOR_VER_THREE)
                      return (EINVAL);
      
              if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE)
                      return (EINVAL);
              if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE)
                      return (EINVAL);
              if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv))
                      return (EINVAL);
      
              /* All supported algorithms require a cipher key. */
              if (en->cipher_key_len == 0)
                      return (EINVAL);
      
              /* No flags are currently supported. */
              if (en->flags != 0)
                      return (EINVAL);
      
              /* Common checks for supported algorithms. */
              switch (en->cipher_algorithm) {
              case CRYPTO_AES_NIST_GCM_16:
                      /*
                       * auth_algorithm isn't used, but permit GMAC values
                       * for compatibility.
                       */
                      switch (en->auth_algorithm) {
                      case 0:
      #ifdef COMPAT_FREEBSD12
                      /* XXX: Really 13.0-current COMPAT. */
                      case CRYPTO_AES_128_NIST_GMAC:
                      case CRYPTO_AES_192_NIST_GMAC:
                      case CRYPTO_AES_256_NIST_GMAC:
      #endif
                              break;
                      default:
                              return (EINVAL);
                      }
                      if (en->auth_key_len != 0)
                              return (EINVAL);
                      if ((en->tls_vminor == TLS_MINOR_VER_TWO &&
                              en->iv_len != TLS_AEAD_GCM_LEN) ||
                          (en->tls_vminor == TLS_MINOR_VER_THREE &&
                              en->iv_len != TLS_1_3_GCM_IV_LEN))
                              return (EINVAL);
                      break;
              case CRYPTO_AES_CBC:
                      switch (en->auth_algorithm) {
                      case CRYPTO_SHA1_HMAC:
                              /*
                               * TLS 1.0 requires an implicit IV.  TLS 1.1+
                               * all use explicit IVs.
                               */
                              if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
                                      if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN)
                                              return (EINVAL);
                                      break;
                              }
      
                              /* FALLTHROUGH */
                      case CRYPTO_SHA2_256_HMAC:
                      case CRYPTO_SHA2_384_HMAC:
                              /* Ignore any supplied IV. */
                              en->iv_len = 0;
                              break;
                      default:
                              return (EINVAL);
                      }
                      if (en->auth_key_len == 0)
                              return (EINVAL);
                      break;
              default:
                      return (EINVAL);
              }
      
              tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
      
              counter_u64_add(ktls_offload_active, 1);
      
              refcount_init(&tls->refcount, 1);
              TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls);
      
              tls->wq_index = ktls_get_cpu(so);
      
              tls->params.cipher_algorithm = en->cipher_algorithm;
              tls->params.auth_algorithm = en->auth_algorithm;
              tls->params.tls_vmajor = en->tls_vmajor;
              tls->params.tls_vminor = en->tls_vminor;
              tls->params.flags = en->flags;
              tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen);
      
              /* Set the header and trailer lengths. */
              tls->params.tls_hlen = sizeof(struct tls_record_layer);
              switch (en->cipher_algorithm) {
              case CRYPTO_AES_NIST_GCM_16:
                      /*
                       * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte
                       * nonce.  TLS 1.3 uses a 12 byte implicit IV.
                       */
                      if (en->tls_vminor < TLS_MINOR_VER_THREE)
                              tls->params.tls_hlen += sizeof(uint64_t);
                      tls->params.tls_tlen = AES_GMAC_HASH_LEN;
      
                      /*
                       * TLS 1.3 includes optional padding which we
                       * do not support, and also puts the "real" record
                       * type at the end of the encrypted data.
                       */
                      if (en->tls_vminor == TLS_MINOR_VER_THREE)
                              tls->params.tls_tlen += sizeof(uint8_t);
      
                      tls->params.tls_bs = 1;
                      break;
              case CRYPTO_AES_CBC:
                      switch (en->auth_algorithm) {
                      case CRYPTO_SHA1_HMAC:
                              if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
                                      /* Implicit IV, no nonce. */
                              } else {
                                      tls->params.tls_hlen += AES_BLOCK_LEN;
                              }
                              tls->params.tls_tlen = AES_BLOCK_LEN +
                                  SHA1_HASH_LEN;
                              break;
                      case CRYPTO_SHA2_256_HMAC:
                              tls->params.tls_hlen += AES_BLOCK_LEN;
                              tls->params.tls_tlen = AES_BLOCK_LEN +
                                  SHA2_256_HASH_LEN;
                              break;
                      case CRYPTO_SHA2_384_HMAC:
                              tls->params.tls_hlen += AES_BLOCK_LEN;
                              tls->params.tls_tlen = AES_BLOCK_LEN +
                                  SHA2_384_HASH_LEN;
                              break;
                      default:
                              panic("invalid hmac");
                      }
                      tls->params.tls_bs = AES_BLOCK_LEN;
                      break;
              default:
                      panic("invalid cipher");
              }
      
              KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN,
                  ("TLS header length too long: %d", tls->params.tls_hlen));
              KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN,
                  ("TLS trailer length too long: %d", tls->params.tls_tlen));
      
              if (en->auth_key_len != 0) {
                      tls->params.auth_key_len = en->auth_key_len;
                      tls->params.auth_key = malloc(en->auth_key_len, M_KTLS,
                          M_WAITOK);
                      error = copyin(en->auth_key, tls->params.auth_key,
                          en->auth_key_len);
                      if (error)
                              goto out;
              }
      
              tls->params.cipher_key_len = en->cipher_key_len;
              tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK);
              error = copyin(en->cipher_key, tls->params.cipher_key,
                  en->cipher_key_len);
              if (error)
                      goto out;
      
              /*
               * This holds the implicit portion of the nonce for GCM and
               * the initial implicit IV for TLS 1.0.  The explicit portions
               * of the IV are generated in ktls_frame().
               */
              if (en->iv_len != 0) {
                      tls->params.iv_len = en->iv_len;
                      error = copyin(en->iv, tls->params.iv, en->iv_len);
                      if (error)
                              goto out;
      
                      /*
                       * For TLS 1.2, generate an 8-byte nonce as a counter
                       * to generate unique explicit IVs.
                       *
                       * Store this counter in the last 8 bytes of the IV
                       * array so that it is 8-byte aligned.
                       */
                      if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
                          en->tls_vminor == TLS_MINOR_VER_TWO)
                              arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0);
              }
      
              *tlsp = tls;
              return (0);
      
      out:
              ktls_cleanup(tls);
              return (error);
      }
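
/*
 * Worked examples (illustrative) of the header/trailer lengths computed
 * above, assuming sizeof(struct tls_record_layer) == 5:
 *
 *   AES-GCM, TLS 1.2:        tls_hlen = 5 + 8 = 13,  tls_tlen = 16
 *   AES-GCM, TLS 1.3:        tls_hlen = 5,           tls_tlen = 16 + 1 = 17
 *   AES-CBC/SHA1, TLS 1.0:   tls_hlen = 5,           tls_tlen = 16 + 20 = 36
 *   AES-CBC/SHA1, TLS 1.1+:  tls_hlen = 5 + 16 = 21, tls_tlen = 36
 *   AES-CBC/SHA2-256:        tls_hlen = 21,          tls_tlen = 16 + 32 = 48
 *   AES-CBC/SHA2-384:        tls_hlen = 21,          tls_tlen = 16 + 48 = 64
 *
 * The CBC trailer lengths are maximums; ktls_frame() trims the unused
 * padding from each record.
 */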
      
      static struct ktls_session *
      ktls_clone_session(struct ktls_session *tls)
      {
              struct ktls_session *tls_new;
      
              tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
      
              counter_u64_add(ktls_offload_active, 1);
      
              refcount_init(&tls_new->refcount, 1);
      
              /* Copy fields from existing session. */
              tls_new->params = tls->params;
              tls_new->wq_index = tls->wq_index;
      
              /* Deep copy keys. */
              if (tls_new->params.auth_key != NULL) {
                      tls_new->params.auth_key = malloc(tls->params.auth_key_len,
                          M_KTLS, M_WAITOK);
                      memcpy(tls_new->params.auth_key, tls->params.auth_key,
                          tls->params.auth_key_len);
              }
      
              tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS,
                  M_WAITOK);
              memcpy(tls_new->params.cipher_key, tls->params.cipher_key,
                  tls->params.cipher_key_len);
      
              return (tls_new);
      }
      #endif
      
      static void
      ktls_cleanup(struct ktls_session *tls)
      {
      
              counter_u64_add(ktls_offload_active, -1);
              switch (tls->mode) {
              case TCP_TLS_MODE_SW:
                      MPASS(tls->be != NULL);
                      switch (tls->params.cipher_algorithm) {
                      case CRYPTO_AES_CBC:
                              counter_u64_add(ktls_sw_cbc, -1);
                              break;
                      case CRYPTO_AES_NIST_GCM_16:
                              counter_u64_add(ktls_sw_gcm, -1);
                              break;
                      }
                      tls->free(tls);
                      break;
              case TCP_TLS_MODE_IFNET:
                      switch (tls->params.cipher_algorithm) {
                      case CRYPTO_AES_CBC:
                              counter_u64_add(ktls_ifnet_cbc, -1);
                              break;
                      case CRYPTO_AES_NIST_GCM_16:
                              counter_u64_add(ktls_ifnet_gcm, -1);
                              break;
                      }
                      if (tls->snd_tag != NULL)
                              m_snd_tag_rele(tls->snd_tag);
                      break;
      #ifdef TCP_OFFLOAD
              case TCP_TLS_MODE_TOE:
                      switch (tls->params.cipher_algorithm) {
                      case CRYPTO_AES_CBC:
                              counter_u64_add(ktls_toe_cbc, -1);
                              break;
                      case CRYPTO_AES_NIST_GCM_16:
                              counter_u64_add(ktls_toe_gcm, -1);
                              break;
                      }
                      break;
      #endif
              }
              if (tls->params.auth_key != NULL) {
                      zfree(tls->params.auth_key, M_KTLS);
                      tls->params.auth_key = NULL;
                      tls->params.auth_key_len = 0;
              }
              if (tls->params.cipher_key != NULL) {
                      zfree(tls->params.cipher_key, M_KTLS);
                      tls->params.cipher_key = NULL;
                      tls->params.cipher_key_len = 0;
              }
              explicit_bzero(tls->params.iv, sizeof(tls->params.iv));
      }
      
      #if defined(INET) || defined(INET6)
      
      #ifdef TCP_OFFLOAD
      static int
      ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
      {
              struct inpcb *inp;
              struct tcpcb *tp;
              int error;
      
              inp = so->so_pcb;
              INP_WLOCK(inp);
              if (inp->inp_flags2 & INP_FREED) {
                      INP_WUNLOCK(inp);
                      return (ECONNRESET);
              }
              if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
                      INP_WUNLOCK(inp);
                      return (ECONNRESET);
              }
              if (inp->inp_socket == NULL) {
                      INP_WUNLOCK(inp);
                      return (ECONNRESET);
              }
              tp = intotcpcb(inp);
              if (tp->tod == NULL) {
                      INP_WUNLOCK(inp);
                      return (EOPNOTSUPP);
              }
      
              error = tcp_offload_alloc_tls_session(tp, tls, direction);
              INP_WUNLOCK(inp);
              if (error == 0) {
                      tls->mode = TCP_TLS_MODE_TOE;
                      switch (tls->params.cipher_algorithm) {
                      case CRYPTO_AES_CBC:
                              counter_u64_add(ktls_toe_cbc, 1);
                              break;
                      case CRYPTO_AES_NIST_GCM_16:
                              counter_u64_add(ktls_toe_gcm, 1);
                              break;
                      }
              }
              return (error);
      }
      #endif
      
      /*
       * Common code used when first enabling ifnet TLS on a connection or
       * when allocating a new ifnet TLS session due to a routing change.
       * This function allocates a new TLS send tag on whatever interface
       * the connection is currently routed over.
       */
      static int
      ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
          struct m_snd_tag **mstp)
      {
              union if_snd_tag_alloc_params params;
              struct ifnet *ifp;
              struct nhop_object *nh;
              struct tcpcb *tp;
              int error;
      
              INP_RLOCK(inp);
              if (inp->inp_flags2 & INP_FREED) {
                      INP_RUNLOCK(inp);
                      return (ECONNRESET);
              }
              if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
                      INP_RUNLOCK(inp);
                      return (ECONNRESET);
              }
              if (inp->inp_socket == NULL) {
                      INP_RUNLOCK(inp);
                      return (ECONNRESET);
              }
              tp = intotcpcb(inp);
      
              /*
               * Check administrative controls on ifnet TLS to determine if
               * ifnet TLS should be denied.
               *
               * - Always permit 'force' requests.
               * - ktls_ifnet_permitted == 0: always deny.
               */
              if (!force && ktls_ifnet_permitted == 0) {
                      INP_RUNLOCK(inp);
                      return (ENXIO);
              }
      
              /*
               * XXX: Use the cached route in the inpcb to find the
               * interface.  This should perhaps instead use
               * rtalloc1_fib(dst, 0, 0, fibnum).  Since KTLS is only
               * enabled after a connection has completed key negotiation in
               * userland, the cached route will be present in practice.
               */
              nh = inp->inp_route.ro_nh;
              if (nh == NULL) {
                      INP_RUNLOCK(inp);
                      return (ENXIO);
              }
              ifp = nh->nh_ifp;
              if_ref(ifp);
      
              params.hdr.type = IF_SND_TAG_TYPE_TLS;
              params.hdr.flowid = inp->inp_flowid;
              params.hdr.flowtype = inp->inp_flowtype;
              params.hdr.numa_domain = inp->inp_numa_domain;
              params.tls.inp = inp;
              params.tls.tls = tls;
              INP_RUNLOCK(inp);
      
              if (ifp->if_snd_tag_alloc == NULL) {
                      error = EOPNOTSUPP;
                      goto out;
              }
        if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
                      error = EOPNOTSUPP;
                      goto out;
              }
              if (inp->inp_vflag & INP_IPV6) {
                      if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) {
                              error = EOPNOTSUPP;
                              goto out;
                      }
              } else {
                      if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) {
                              error = EOPNOTSUPP;
                              goto out;
                      }
              }
              error = ifp->if_snd_tag_alloc(ifp, &params, mstp);
      out:
              if_rele(ifp);
              return (error);
      }
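
/*
 * Illustrative sketch (not compiled): the driver side of the allocation
 * above.  A NIC supporting ifnet TLS advertises IFCAP_NOMAP together
 * with IFCAP_TXTLS4/IFCAP_TXTLS6 and provides an if_snd_tag_alloc
 * method.  The name example_snd_tag_alloc and the supported-cipher
 * check are hypothetical.
 */
#if 0
static int
example_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
    struct m_snd_tag **pt)
{

        if (params->hdr.type != IF_SND_TAG_TYPE_TLS)
                return (EOPNOTSUPP);

        /* Only offload cipher suites the hardware implements. */
        switch (params->tls.tls->params.cipher_algorithm) {
        case CRYPTO_AES_NIST_GCM_16:
                break;
        default:
                return (EOPNOTSUPP);
        }

        /*
         * A real driver allocates a tag structure embedding a
         * struct m_snd_tag, initializes it with m_snd_tag_init(),
         * programs the session keys into the NIC, and returns the
         * embedded tag through '*pt' with a 0 return value.
         */
        return (EOPNOTSUPP);
}
#endif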
      
      static int
      ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force)
      {
              struct m_snd_tag *mst;
              int error;
      
              error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst);
              if (error == 0) {
                      tls->mode = TCP_TLS_MODE_IFNET;
                      tls->snd_tag = mst;
                      switch (tls->params.cipher_algorithm) {
                      case CRYPTO_AES_CBC:
                              counter_u64_add(ktls_ifnet_cbc, 1);
                              break;
                      case CRYPTO_AES_NIST_GCM_16:
                              counter_u64_add(ktls_ifnet_gcm, 1);
                              break;
                      }
              }
              return (error);
      }
      
      static int
      ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction)
      {
              struct rm_priotracker prio;
              struct ktls_crypto_backend *be;
      
              /*
               * Choose the best software crypto backend.  Backends are
         * stored in sorted priority order (largest value == most
               * important at the head of the list), so this just stops on
               * the first backend that claims the session by returning
               * success.
               */
              if (ktls_allow_unload)
                      rm_rlock(&ktls_backends_lock, &prio);
              LIST_FOREACH(be, &ktls_backends, next) {
                      if (be->try(so, tls, direction) == 0)
                              break;
                      KASSERT(tls->cipher == NULL,
                          ("ktls backend leaked a cipher pointer"));
              }
              if (be != NULL) {
                      if (ktls_allow_unload)
                              be->use_count++;
                      tls->be = be;
              }
              if (ktls_allow_unload)
                      rm_runlock(&ktls_backends_lock, &prio);
              if (be == NULL)
                      return (EOPNOTSUPP);
              tls->mode = TCP_TLS_MODE_SW;
              switch (tls->params.cipher_algorithm) {
              case CRYPTO_AES_CBC:
                      counter_u64_add(ktls_sw_cbc, 1);
                      break;
              case CRYPTO_AES_NIST_GCM_16:
                      counter_u64_add(ktls_sw_gcm, 1);
                      break;
              }
              return (0);
      }
      
      /*
       * KTLS RX stores data in the socket buffer as a list of TLS records,
 * where each record is stored as a control message containing the TLS
       * header followed by data mbufs containing the decrypted data.  This
       * is different from KTLS TX which always uses an mb_ext_pgs mbuf for
       * both encrypted and decrypted data.  TLS records decrypted by a NIC
       * should be queued to the socket buffer as records, but encrypted
       * data which needs to be decrypted by software arrives as a stream of
       * regular mbufs which need to be converted.  In addition, there may
       * already be pending encrypted data in the socket buffer when KTLS RX
       * is enabled.
       *
       * To manage not-yet-decrypted data for KTLS RX, the following scheme
       * is used:
       *
       * - A single chain of NOTREADY mbufs is hung off of sb_mtls.
       *
       * - ktls_check_rx checks this chain of mbufs reading the TLS header
       *   from the first mbuf.  Once all of the data for that TLS record is
       *   queued, the socket is queued to a worker thread.
       *
       * - The worker thread calls ktls_decrypt to decrypt TLS records in
       *   the TLS chain.  Each TLS record is detached from the TLS chain,
 *   decrypted, and inserted into the regular socket buffer chain as a
 *   record starting with a control message holding the TLS header and
 *   a chain of mbufs holding the decrypted data.
       */
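
/*
 * Illustrative sketch (not compiled): how a userland consumer reads the
 * records described above once KTLS RX is enabled on a TCP socket.
 * Each recvmsg() returns one TLS record; the record header arrives as a
 * TLS_GET_RECORD control message carrying a struct tls_get_record and
 * the decrypted payload arrives in the data buffer.  Constant and
 * structure names are from sys/ktls.h and netinet/tcp.h; the buffer
 * size and function name are arbitrary.
 */
#if 0
static void
example_read_tls_record(int s)
{
        struct tls_get_record tgr;
        struct cmsghdr *cmsg;
        struct msghdr msg;
        struct iovec iov;
        char payload[16384];
        char cbuf[CMSG_SPACE(sizeof(struct tls_get_record))];
        ssize_t n;

        memset(&msg, 0, sizeof(msg));
        iov.iov_base = payload;
        iov.iov_len = sizeof(payload);
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = cbuf;
        msg.msg_controllen = sizeof(cbuf);

        n = recvmsg(s, &msg, 0);
        if (n <= 0)
                return;
        cmsg = CMSG_FIRSTHDR(&msg);
        if (cmsg != NULL && cmsg->cmsg_level == IPPROTO_TCP &&
            cmsg->cmsg_type == TLS_GET_RECORD) {
                memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
                /* 'n' bytes of decrypted payload of type tgr.tls_type. */
        }
}
#endif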
      
      static void
      sb_mark_notready(struct sockbuf *sb)
      {
              struct mbuf *m;
      
              m = sb->sb_mb;
              sb->sb_mtls = m;
              sb->sb_mb = NULL;
              sb->sb_mbtail = NULL;
              sb->sb_lastrecord = NULL;
              for (; m != NULL; m = m->m_next) {
                      KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
                          __func__));
                      KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail",
                          __func__));
                      KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len",
                          __func__));
                      m->m_flags |= M_NOTREADY;
                      sb->sb_acc -= m->m_len;
                      sb->sb_tlscc += m->m_len;
                      sb->sb_mtlstail = m;
              }
              KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc,
                  ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc,
                  sb->sb_ccc));
      }
      
      int
      ktls_enable_rx(struct socket *so, struct tls_enable *en)
{
              struct ktls_session *tls;
              int error;
      
        if (!ktls_offload_enable)
                      return (ENOTSUP);
      
              counter_u64_add(ktls_offload_enable_calls, 1);
      
              /*
               * This should always be true since only the TCP socket option
               * invokes this function.
               */
              if (so->so_proto->pr_protocol != IPPROTO_TCP)
                      return (EINVAL);
      
              /*
               * XXX: Don't overwrite existing sessions.  We should permit
               * this to support rekeying in the future.
               */
              if (so->so_rcv.sb_tls_info != NULL)
                      return (EALREADY);
      
              if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
                      return (ENOTSUP);
      
              /* TLS 1.3 is not yet supported. */
              if (en->tls_vmajor == TLS_MAJOR_VER_ONE &&
                  en->tls_vminor == TLS_MINOR_VER_THREE)
                      return (ENOTSUP);
      
              error = ktls_create_session(so, en, &tls);
              if (error)
                      return (error);
      
      #ifdef TCP_OFFLOAD
              error = ktls_try_toe(so, tls, KTLS_RX);
              if (error)
      #endif
                      error = ktls_try_sw(so, tls, KTLS_RX);
      
              if (error) {
                      ktls_cleanup(tls);
                      return (error);
              }
      
              /* Mark the socket as using TLS offload. */
              SOCKBUF_LOCK(&so->so_rcv);
              so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq);
              so->so_rcv.sb_tls_info = tls;
              so->so_rcv.sb_flags |= SB_TLS_RX;
      
              /* Mark existing data as not ready until it can be decrypted. */
              sb_mark_notready(&so->so_rcv);
              ktls_check_rx(&so->so_rcv);
              SOCKBUF_UNLOCK(&so->so_rcv);
      
              counter_u64_add(ktls_offload_total, 1);
      
              return (0);
      }
      
      int
      ktls_enable_tx(struct socket *so, struct tls_enable *en)
{
              struct ktls_session *tls;
              int error;
      
        if (!ktls_offload_enable)
                      return (ENOTSUP);
      
              counter_u64_add(ktls_offload_enable_calls, 1);
      
              /*
               * This should always be true since only the TCP socket option
               * invokes this function.
               */
              if (so->so_proto->pr_protocol != IPPROTO_TCP)
                      return (EINVAL);
      
              /*
               * XXX: Don't overwrite existing sessions.  We should permit
               * this to support rekeying in the future.
               */
              if (so->so_snd.sb_tls_info != NULL)
                      return (EALREADY);
      
              if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
                      return (ENOTSUP);
      
              /* TLS requires ext pgs */
              if (mb_use_ext_pgs == 0)
                      return (ENXIO);
      
              error = ktls_create_session(so, en, &tls);
              if (error)
                      return (error);
      
              /* Prefer TOE -> ifnet TLS -> software TLS. */
      #ifdef TCP_OFFLOAD
              error = ktls_try_toe(so, tls, KTLS_TX);
              if (error)
      #endif
                      error = ktls_try_ifnet(so, tls, false);
              if (error)
                      error = ktls_try_sw(so, tls, KTLS_TX);
      
              if (error) {
                      ktls_cleanup(tls);
                      return (error);
              }
      
              error = sblock(&so->so_snd, SBL_WAIT);
              if (error) {
                      ktls_cleanup(tls);
                      return (error);
              }
      
              SOCKBUF_LOCK(&so->so_snd);
              so->so_snd.sb_tls_seqno = be64dec(en->rec_seq);
              so->so_snd.sb_tls_info = tls;
              if (tls->mode != TCP_TLS_MODE_SW)
                      so->so_snd.sb_flags |= SB_TLS_IFNET;
              SOCKBUF_UNLOCK(&so->so_snd);
              sbunlock(&so->so_snd);
      
              counter_u64_add(ktls_offload_total, 1);
      
              return (0);
      }
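
/*
 * Illustrative sketch (not compiled): a userland TLS library enabling
 * TX offload for an established AES-128-GCM, TLS 1.2 session.  The
 * struct tls_enable layout is from sys/ktls.h and TCP_TXTLS_ENABLE from
 * netinet/tcp.h; the key, salt, and record-sequence arguments are
 * assumed to come from the library's completed handshake.
 */
#if 0
static void
example_enable_tx_offload(int s, const uint8_t *session_key,
    const uint8_t *implicit_iv, const uint8_t *next_record_seq)
{
        struct tls_enable en;

        memset(&en, 0, sizeof(en));
        en.cipher_algorithm = CRYPTO_AES_NIST_GCM_16;
        en.cipher_key = session_key;    /* 16 bytes for AES-128. */
        en.cipher_key_len = 16;
        en.iv = implicit_iv;            /* 4 byte TLS 1.2 salt. */
        en.iv_len = TLS_AEAD_GCM_LEN;
        en.tls_vmajor = TLS_MAJOR_VER_ONE;
        en.tls_vminor = TLS_MINOR_VER_TWO;
        memcpy(en.rec_seq, next_record_seq, sizeof(en.rec_seq));

        if (setsockopt(s, IPPROTO_TCP, TCP_TXTLS_ENABLE, &en, sizeof(en)) != 0)
                err(1, "TCP_TXTLS_ENABLE");
}
#endif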
      
      int
      ktls_get_rx_mode(struct socket *so)
{
              struct ktls_session *tls;
              struct inpcb *inp;
              int mode;
      
              inp = so->so_pcb;
              INP_WLOCK_ASSERT(inp);
              SOCKBUF_LOCK(&so->so_rcv);
              tls = so->so_rcv.sb_tls_info;
        if (tls == NULL)
                      mode = TCP_TLS_MODE_NONE;
              else
                      mode = tls->mode;
              SOCKBUF_UNLOCK(&so->so_rcv);
              return (mode);
      }
      
      int
      ktls_get_tx_mode(struct socket *so)
{
              struct ktls_session *tls;
              struct inpcb *inp;
              int mode;
      
              inp = so->so_pcb;
              INP_WLOCK_ASSERT(inp);
              SOCKBUF_LOCK(&so->so_snd);
              tls = so->so_snd.sb_tls_info;
        if (tls == NULL)
                      mode = TCP_TLS_MODE_NONE;
              else
                      mode = tls->mode;
              SOCKBUF_UNLOCK(&so->so_snd);
              return (mode);
      }
      
      /*
       * Switch between SW and ifnet TLS sessions as requested.
       */
      int
      ktls_set_tx_mode(struct socket *so, int mode)
{
              struct ktls_session *tls, *tls_new;
              struct inpcb *inp;
              int error;
      
        switch (mode) {
              case TCP_TLS_MODE_SW:
              case TCP_TLS_MODE_IFNET:
                      break;
              default:
                      return (EINVAL);
              }
      
              inp = so->so_pcb;
              INP_WLOCK_ASSERT(inp);
              SOCKBUF_LOCK(&so->so_snd);
              tls = so->so_snd.sb_tls_info;
              if (tls == NULL) {
                SOCKBUF_UNLOCK(&so->so_snd);
                      return (0);
              }
      
              if (tls->mode == mode) {
                      SOCKBUF_UNLOCK(&so->so_snd);
                      return (0);
              }
      
              tls = ktls_hold(tls);
              SOCKBUF_UNLOCK(&so->so_snd);
              INP_WUNLOCK(inp);
      
              tls_new = ktls_clone_session(tls);
      
              if (mode == TCP_TLS_MODE_IFNET)
                      error = ktls_try_ifnet(so, tls_new, true);
              else
                      error = ktls_try_sw(so, tls_new, KTLS_TX);
              if (error) {
                      counter_u64_add(ktls_switch_failed, 1);
                      ktls_free(tls_new);
                      ktls_free(tls);
                      INP_WLOCK(inp);
                      return (error);
              }
      
              error = sblock(&so->so_snd, SBL_WAIT);
              if (error) {
                      counter_u64_add(ktls_switch_failed, 1);
                      ktls_free(tls_new);
                      ktls_free(tls);
                      INP_WLOCK(inp);
                      return (error);
              }
      
              /*
               * If we raced with another session change, keep the existing
               * session.
               */
              if (tls != so->so_snd.sb_tls_info) {
                      counter_u64_add(ktls_switch_failed, 1);
                      sbunlock(&so->so_snd);
                      ktls_free(tls_new);
                      ktls_free(tls);
                      INP_WLOCK(inp);
                      return (EBUSY);
              }
      
              SOCKBUF_LOCK(&so->so_snd);
              so->so_snd.sb_tls_info = tls_new;
              if (tls_new->mode != TCP_TLS_MODE_SW)
                      so->so_snd.sb_flags |= SB_TLS_IFNET;
              SOCKBUF_UNLOCK(&so->so_snd);
              sbunlock(&so->so_snd);
      
              /*
               * Drop two references on 'tls'.  The first is for the
               * ktls_hold() above.  The second drops the reference from the
               * socket buffer.
               */
              KASSERT(tls->refcount >= 2, ("too few references on old session"));
              ktls_free(tls);
              ktls_free(tls);
      
              if (mode == TCP_TLS_MODE_IFNET)
                      counter_u64_add(ktls_switch_to_ifnet, 1);
              else
                      counter_u64_add(ktls_switch_to_sw, 1);
      
              INP_WLOCK(inp);
              return (0);
      }
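
/*
 * Illustrative sketch (not compiled): how userland reaches
 * ktls_get_tx_mode() and ktls_set_tx_mode() above through the
 * TCP_TXTLS_MODE socket option from netinet/tcp.h.  The descriptor 's'
 * is assumed to already carry a TX TLS session.
 */
#if 0
static void
example_switch_tx_mode(int s)
{
        socklen_t len;
        int mode;

        len = sizeof(mode);
        if (getsockopt(s, IPPROTO_TCP, TCP_TXTLS_MODE, &mode, &len) != 0)
                return;
        if (mode == TCP_TLS_MODE_SW) {
                /* Ask the kernel to move the session to NIC (ifnet) TLS. */
                mode = TCP_TLS_MODE_IFNET;
                if (setsockopt(s, IPPROTO_TCP, TCP_TXTLS_MODE, &mode,
                    sizeof(mode)) != 0)
                        warn("TCP_TXTLS_MODE");
        }
}
#endif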
      
      /*
       * Try to allocate a new TLS send tag.  This task is scheduled when
       * ip_output detects a route change while trying to transmit a packet
       * holding a TLS record.  If a new tag is allocated, replace the tag
       * in the TLS session.  Subsequent packets on the connection will use
       * the new tag.  If a new tag cannot be allocated, drop the
       * connection.
       */
      static void
      ktls_reset_send_tag(void *context, int pending)
      {
              struct epoch_tracker et;
              struct ktls_session *tls;
              struct m_snd_tag *old, *new;
              struct inpcb *inp;
              struct tcpcb *tp;
              int error;
      
              MPASS(pending == 1);
      
              tls = context;
              inp = tls->inp;
      
              /*
               * Free the old tag first before allocating a new one.
               * ip[6]_output_send() will treat a NULL send tag the same as
               * an ifp mismatch and drop packets until a new tag is
               * allocated.
               *
               * Write-lock the INP when changing tls->snd_tag since
               * ip[6]_output_send() holds a read-lock when reading the
               * pointer.
               */
              INP_WLOCK(inp);
              old = tls->snd_tag;
              tls->snd_tag = NULL;
              INP_WUNLOCK(inp);
              if (old != NULL)
                      m_snd_tag_rele(old);
      
              error = ktls_alloc_snd_tag(inp, tls, true, &new);
      
              if (error == 0) {
                      INP_WLOCK(inp);
                      tls->snd_tag = new;
                      mtx_pool_lock(mtxpool_sleep, tls);
                      tls->reset_pending = false;
                      mtx_pool_unlock(mtxpool_sleep, tls);
                      if (!in_pcbrele_wlocked(inp))
                              INP_WUNLOCK(inp);
      
                      counter_u64_add(ktls_ifnet_reset, 1);
      
                      /*
                       * XXX: Should we kick tcp_output explicitly now that
                       * the send tag is fixed or just rely on timers?
                       */
              } else {
                      NET_EPOCH_ENTER(et);
                      INP_WLOCK(inp);
                      if (!in_pcbrele_wlocked(inp)) {
                              if (!(inp->inp_flags & INP_TIMEWAIT) &&
                                  !(inp->inp_flags & INP_DROPPED)) {
                                      tp = intotcpcb(inp);
                                      CURVNET_SET(tp->t_vnet);
                                      tp = tcp_drop(tp, ECONNABORTED);
                                      CURVNET_RESTORE();
                                      if (tp != NULL)
                                              INP_WUNLOCK(inp);
                                      counter_u64_add(ktls_ifnet_reset_dropped, 1);
                              } else
                                      INP_WUNLOCK(inp);
                      }
                      NET_EPOCH_EXIT(et);
      
                      counter_u64_add(ktls_ifnet_reset_failed, 1);
      
                      /*
                       * Leave reset_pending true to avoid future tasks while
                       * the socket goes away.
                       */
              }
      
              ktls_free(tls);
      }
      
      int
      ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls)
      {
      
              if (inp == NULL)
                      return (ENOBUFS);
      
              INP_LOCK_ASSERT(inp);
      
              /*
               * See if we should schedule a task to update the send tag for
               * this session.
               */
              mtx_pool_lock(mtxpool_sleep, tls);
              if (!tls->reset_pending) {
                      (void) ktls_hold(tls);
                      in_pcbref(inp);
                      tls->inp = inp;
                      tls->reset_pending = true;
                      taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
              }
              mtx_pool_unlock(mtxpool_sleep, tls);
              return (ENOBUFS);
      }
      #endif
      
      void
      ktls_destroy(struct ktls_session *tls)
      {
              struct rm_priotracker prio;
      
              ktls_cleanup(tls);
              if (tls->be != NULL && ktls_allow_unload) {
                      rm_rlock(&ktls_backends_lock, &prio);
                      tls->be->use_count--;
                      rm_runlock(&ktls_backends_lock, &prio);
              }
              uma_zfree(ktls_session_zone, tls);
      }
      
      void
      ktls_seq(struct sockbuf *sb, struct mbuf *m)
      {
      
              for (; m != NULL; m = m->m_next) {
                      KASSERT((m->m_flags & M_EXTPG) != 0,
                          ("ktls_seq: mapped mbuf %p", m));
      
                      m->m_epg_seqno = sb->sb_tls_seqno;
                      sb->sb_tls_seqno++;
              }
      }
      
      /*
       * Add TLS framing (headers and trailers) to a chain of mbufs.  Each
       * mbuf in the chain must be an unmapped mbuf.  The payload of the
       * mbuf must be populated with the payload of each TLS record.
       *
       * The record_type argument specifies the TLS record type used when
       * populating the TLS header.
       *
       * The enq_count argument on return is set to the number of pages of
       * payload data for this entire chain that need to be encrypted via SW
       * encryption.  The returned value should be passed to ktls_enqueue
       * when scheduling encryption of this chain of mbufs.
       */
      void
      ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt,
          uint8_t record_type)
      {
              struct tls_record_layer *tlshdr;
              struct mbuf *m;
              uint64_t *noncep;
              uint16_t tls_len;
              int maxlen;
      
              maxlen = tls->params.max_frame_len;
              *enq_cnt = 0;
              for (m = top; m != NULL; m = m->m_next) {
                      /*
                       * All mbufs in the chain should be non-empty TLS
                       * records whose payload does not exceed the maximum
                       * frame length.
                       */
                      KASSERT(m->m_len <= maxlen && m->m_len > 0,
                          ("ktls_frame: m %p len %d\n", m, m->m_len));
                      /*
                       * TLS frames require unmapped mbufs to store session
                       * info.
                       */
                      KASSERT((m->m_flags & M_EXTPG) != 0,
                          ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top));
      
                      tls_len = m->m_len;
      
                      /* Save a reference to the session. */
                      m->m_epg_tls = ktls_hold(tls);
      
                      m->m_epg_hdrlen = tls->params.tls_hlen;
                      m->m_epg_trllen = tls->params.tls_tlen;
                      if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
                              int bs, delta;
      
                              /*
                               * AES-CBC pads messages to a multiple of the
                               * block size.  Note that the padding is
                               * applied after the digest and the encryption
                               * is done on the "plaintext || mac || padding".
                               * At least one byte of padding is always
                               * present.
                               *
                               * Compute the final trailer length assuming
                               * at most one block of padding.
                         * tls->params.tls_tlen is the maximum
                               * possible trailer length (padding + digest).
                               * delta holds the number of excess padding
                               * bytes if the maximum were used.  Those
                               * extra bytes are removed.
                               */
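                        /*
                         * Worked example (illustrative): with AES-CBC and
                         * SHA1-HMAC, tls_tlen is 16 + 20 = 36.  A 100 byte
                         * record gives delta = (100 + 36) & 15 = 8, so the
                         * trailer is trimmed to 36 - 8 = 28 bytes: the 20
                         * byte MAC plus 8 bytes of padding, which makes the
                         * 100 + 28 byte ciphertext a multiple of the 16
                         * byte block size.
                         */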
                              bs = tls->params.tls_bs;
                              delta = (tls_len + tls->params.tls_tlen) & (bs - 1);
                              m->m_epg_trllen -= delta;
                      }
                      m->m_len += m->m_epg_hdrlen + m->m_epg_trllen;
      
                      /* Populate the TLS header. */
                      tlshdr = (void *)m->m_epg_hdr;
                      tlshdr->tls_vmajor = tls->params.tls_vmajor;
      
                      /*
                 * TLS 1.3 masquerades as TLS 1.2 with a record type
                       * of TLS_RLTYPE_APP.
                       */
                      if (tls->params.tls_vminor == TLS_MINOR_VER_THREE &&
                          tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) {
                              tlshdr->tls_vminor = TLS_MINOR_VER_TWO;
                              tlshdr->tls_type = TLS_RLTYPE_APP;
                              /* save the real record type for later */
                              m->m_epg_record_type = record_type;
                              m->m_epg_trail[0] = record_type;
                      } else {
                              tlshdr->tls_vminor = tls->params.tls_vminor;
                              tlshdr->tls_type = record_type;
                      }
                      tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr));
      
                      /*
                       * Store nonces / explicit IVs after the end of the
                       * TLS header.
                       *
                       * For GCM with TLS 1.2, an 8 byte nonce is copied
                       * from the end of the IV.  The nonce is then
                       * incremented for use by the next record.
                       *
                       * For CBC, a random nonce is inserted for TLS 1.1+.
                       */
                      if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
                          tls->params.tls_vminor == TLS_MINOR_VER_TWO) {
                              noncep = (uint64_t *)(tls->params.iv + 8);
                              be64enc(tlshdr + 1, *noncep);
                              (*noncep)++;
                      } else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
                          tls->params.tls_vminor >= TLS_MINOR_VER_ONE)
                              arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0);
      
                      /*
                       * When using SW encryption, mark the mbuf not ready.
                       * It will be marked ready via sbready() after the
                       * record has been encrypted.
                       *
                       * When using ifnet TLS, unencrypted TLS records are
                       * sent down the stack to the NIC.
                       */
                      if (tls->mode == TCP_TLS_MODE_SW) {
                              m->m_flags |= M_NOTREADY;
                              m->m_epg_nrdy = m->m_epg_npgs;
                              *enq_cnt += m->m_epg_npgs;
                      }
              }
      }
      
      void
      ktls_check_rx(struct sockbuf *sb)
      {
              struct tls_record_layer hdr;
              struct ktls_wq *wq;
              struct socket *so;
              bool running;
      
              SOCKBUF_LOCK_ASSERT(sb);
              KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
                  __func__, sb));
              so = __containerof(sb, struct socket, so_rcv);
      
              if (sb->sb_flags & SB_TLS_RX_RUNNING)
                      return;
      
              /* Is there enough queued for a TLS header? */
              if (sb->sb_tlscc < sizeof(hdr)) {
                      if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0)
                              so->so_error = EMSGSIZE;
                      return;
              }
      
              m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr);
      
              /* Is the entire record queued? */
              if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) {
                      if ((sb->sb_state & SBS_CANTRCVMORE) != 0)
                              so->so_error = EMSGSIZE;
                      return;
              }
      
              sb->sb_flags |= SB_TLS_RX_RUNNING;
      
              soref(so);
              wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index];
              mtx_lock(&wq->mtx);
              STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list);
              running = wq->running;
              mtx_unlock(&wq->mtx);
              if (!running)
                      wakeup(wq);
              counter_u64_add(ktls_cnt_rx_queued, 1);
      }
      
      static struct mbuf *
      ktls_detach_record(struct sockbuf *sb, int len)
      {
              struct mbuf *m, *n, *top;
              int remain;
      
              SOCKBUF_LOCK_ASSERT(sb);
              MPASS(len <= sb->sb_tlscc);
      
              /*
               * If TLS chain is the exact size of the record,
               * just grab the whole record.
               */
              top = sb->sb_mtls;
              if (sb->sb_tlscc == len) {
                      sb->sb_mtls = NULL;
                      sb->sb_mtlstail = NULL;
                      goto out;
              }
      
              /*
               * While it would be nice to use m_split() here, we need
               * to know exactly what m_split() allocates to update the
               * accounting, so do it inline instead.
               */
              remain = len;
              for (m = top; remain > m->m_len; m = m->m_next)
                      remain -= m->m_len;
      
              /* Easy case: don't have to split 'm'. */
              if (remain == m->m_len) {
                      sb->sb_mtls = m->m_next;
                      if (sb->sb_mtls == NULL)
                              sb->sb_mtlstail = NULL;
                      m->m_next = NULL;
                      goto out;
              }
      
              /*
               * Need to allocate an mbuf to hold the remainder of 'm'.  Try
               * with M_NOWAIT first.
               */
              n = m_get(M_NOWAIT, MT_DATA);
              if (n == NULL) {
                      /*
                       * Use M_WAITOK with socket buffer unlocked.  If
                       * 'sb_mtls' changes while the lock is dropped, return
                       * NULL to force the caller to retry.
                       */
                      SOCKBUF_UNLOCK(sb);
      
                      n = m_get(M_WAITOK, MT_DATA);
      
                      SOCKBUF_LOCK(sb);
                      if (sb->sb_mtls != top) {
                              m_free(n);
                              return (NULL);
                      }
              }
              n->m_flags |= M_NOTREADY;
      
              /* Store remainder in 'n'. */
              n->m_len = m->m_len - remain;
              if (m->m_flags & M_EXT) {
                      n->m_data = m->m_data + remain;
                      mb_dupcl(n, m);
              } else {
                      bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len);
              }
      
              /* Trim 'm' and update accounting. */
              m->m_len -= n->m_len;
              sb->sb_tlscc -= n->m_len;
              sb->sb_ccc -= n->m_len;
      
              /* Account for 'n'. */
              sballoc_ktls_rx(sb, n);
      
              /* Insert 'n' into the TLS chain. */
              sb->sb_mtls = n;
              n->m_next = m->m_next;
              if (sb->sb_mtlstail == m)
                      sb->sb_mtlstail = n;
      
              /* Detach the record from the TLS chain. */
              m->m_next = NULL;
      
      out:
              MPASS(m_length(top, NULL) == len);
              for (m = top; m != NULL; m = m->m_next)
                      sbfree_ktls_rx(sb, m);
              sb->sb_tlsdcc = len;
              sb->sb_ccc += len;
              SBCHECK(sb);
              return (top);
      }
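
/*
 * Accounting sketch (illustrative, derived from the code above): after
 * ktls_detach_record() successfully detaches a 'len'-byte record, the
 * bytes move from the "encrypted, not yet processed" count to the
 * "being decrypted" count while the overall byte count is preserved:
 *
 *	sb_tlscc  -= len	(record left the TLS mbuf chain)
 *	sb_tlsdcc  = len	(record handed to the decrypting worker)
 *	sb_ccc	 unchanged	(sbfree_ktls_rx() subtracted len and the
 *				 final 'sb_ccc += len' added it back)
 */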
      
      static void
      ktls_decrypt(struct socket *so)
      {
              char tls_header[MBUF_PEXT_HDR_LEN];
              struct ktls_session *tls;
              struct sockbuf *sb;
              struct tls_record_layer *hdr;
              struct tls_get_record tgr;
              struct mbuf *control, *data, *m;
              uint64_t seqno;
              int error, remain, tls_len, trail_len;
      
              hdr = (struct tls_record_layer *)tls_header;
              sb = &so->so_rcv;
              SOCKBUF_LOCK(sb);
              KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING,
                  ("%s: socket %p not running", __func__, so));
      
              tls = sb->sb_tls_info;
              MPASS(tls != NULL);
      
              for (;;) {
                      /* Is there enough queued for a TLS header? */
                      if (sb->sb_tlscc < tls->params.tls_hlen)
                              break;
      
                      m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header);
                      tls_len = sizeof(*hdr) + ntohs(hdr->tls_length);
      
                      if (hdr->tls_vmajor != tls->params.tls_vmajor ||
                          hdr->tls_vminor != tls->params.tls_vminor)
                              error = EINVAL;
                      else if (tls_len < tls->params.tls_hlen || tls_len >
                          tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 +
                          tls->params.tls_tlen)
                              error = EMSGSIZE;
                      else
                              error = 0;
                      if (__predict_false(error != 0)) {
                              /*
                               * We have a corrupted record and are likely
                               * out of sync.  The connection isn't
                               * recoverable at this point, so abort it.
                               */
                              SOCKBUF_UNLOCK(sb);
                              counter_u64_add(ktls_offload_corrupted_records, 1);
      
                              CURVNET_SET(so->so_vnet);
                              so->so_proto->pr_usrreqs->pru_abort(so);
                              so->so_error = error;
                              CURVNET_RESTORE();
                              goto deref;
                      }
      
                      /* Is the entire record queued? */
                      if (sb->sb_tlscc < tls_len)
                              break;
      
                      /*
                       * Split out the portion of the mbuf chain containing
                       * this TLS record.
                       */
                      data = ktls_detach_record(sb, tls_len);
                      if (data == NULL)
                              continue;
                      MPASS(sb->sb_tlsdcc == tls_len);
      
                      seqno = sb->sb_tls_seqno;
                      sb->sb_tls_seqno++;
                      SBCHECK(sb);
                      SOCKBUF_UNLOCK(sb);
      
                      error = tls->sw_decrypt(tls, hdr, data, seqno, &trail_len);
                      if (error) {
                              counter_u64_add(ktls_offload_failed_crypto, 1);
      
                              SOCKBUF_LOCK(sb);
                              if (sb->sb_tlsdcc == 0) {
                                      /*
                                       * sbcut/drop/flush discarded these
                                       * mbufs.
                                       */
                                      m_freem(data);
                                      break;
                              }
      
                              /*
                               * Drop this TLS record's data, but keep
                               * decrypting subsequent records.
                               */
                              sb->sb_ccc -= tls_len;
                              sb->sb_tlsdcc = 0;
      
                              CURVNET_SET(so->so_vnet);
                              so->so_error = EBADMSG;
                              sorwakeup_locked(so);
                              CURVNET_RESTORE();
      
                              m_freem(data);
      
                              SOCKBUF_LOCK(sb);
                              continue;
                      }
      
                      /* Allocate the control mbuf. */
                      tgr.tls_type = hdr->tls_type;
                      tgr.tls_vmajor = hdr->tls_vmajor;
                      tgr.tls_vminor = hdr->tls_vminor;
                      tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen -
                          trail_len);
                      control = sbcreatecontrol_how(&tgr, sizeof(tgr),
                          TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK);
      
                      SOCKBUF_LOCK(sb);
                      if (sb->sb_tlsdcc == 0) {
                              /* sbcut/drop/flush discarded these mbufs. */
                              MPASS(sb->sb_tlscc == 0);
                              m_freem(data);
                              m_freem(control);
                              break;
                      }
      
                      /*
                       * Clear the 'dcc' accounting in preparation for
                       * adding the decrypted record.
                       */
                      sb->sb_ccc -= tls_len;
                      sb->sb_tlsdcc = 0;
                      SBCHECK(sb);
      
                      /* If there is no payload, drop all of the data. */
                      if (tgr.tls_length == htobe16(0)) {
                              m_freem(data);
                              data = NULL;
                      } else {
                              /* Trim header. */
                              remain = tls->params.tls_hlen;
                              while (remain > 0) {
                                      if (data->m_len > remain) {
                                              data->m_data += remain;
                                              data->m_len -= remain;
                                              break;
                                      }
                                      remain -= data->m_len;
                                      data = m_free(data);
                              }
      
                              /* Trim trailer and clear M_NOTREADY. */
                              remain = be16toh(tgr.tls_length);
			for (m = data; remain > m->m_len; m = m->m_next) {
                                      m->m_flags &= ~M_NOTREADY;
                                      remain -= m->m_len;
                              }
                              m->m_len = remain;
                              m_freem(m->m_next);
                              m->m_next = NULL;
                              m->m_flags &= ~M_NOTREADY;
      
                              /* Set EOR on the final mbuf. */
                              m->m_flags |= M_EOR;
                      }
      
                      sbappendcontrol_locked(sb, data, control, 0);
              }
      
              sb->sb_flags &= ~SB_TLS_RX_RUNNING;
      
              if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0)
                      so->so_error = EMSGSIZE;
      
              sorwakeup_locked(so);
      
      deref:
              SOCKBUF_UNLOCK_ASSERT(sb);
      
              CURVNET_SET(so->so_vnet);
              SOCK_LOCK(so);
              sorele(so);
              CURVNET_RESTORE();
      }
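
/*
 * Illustrative userland sketch (not part of this file): each record
 * decrypted above is appended to the receive buffer together with a
 * TLS_GET_RECORD control message, so a KTLS RX consumer is expected to
 * use recvmsg(2) and pull the record metadata out of the cmsg before
 * consuming the payload.  'read_one_record' is a made-up name, 's' is
 * an assumed connected TCP socket with receive-side KTLS enabled, and
 * the usual userland <sys/socket.h>, <netinet/in.h> and <netinet/tcp.h>
 * headers are assumed.
 */
#if 0
	static void
	read_one_record(int s)
	{
		struct tls_get_record tgr;
		struct cmsghdr *cmsg;
		struct msghdr msg;
		struct iovec iov;
		char cbuf[CMSG_SPACE(sizeof(struct tls_get_record))];
		char buf[16384];
		ssize_t n;

		memset(&msg, 0, sizeof(msg));
		iov.iov_base = buf;
		iov.iov_len = sizeof(buf);
		msg.msg_iov = &iov;
		msg.msg_iovlen = 1;
		msg.msg_control = cbuf;
		msg.msg_controllen = sizeof(cbuf);

		n = recvmsg(s, &msg, 0);
		if (n <= 0)
			return;
		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
		    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
			if (cmsg->cmsg_level == IPPROTO_TCP &&
			    cmsg->cmsg_type == TLS_GET_RECORD) {
				memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
				/* tgr.tls_type/tls_length describe buf[]. */
			}
		}
	}
#endif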
      
      void
      ktls_enqueue_to_free(struct mbuf *m)
      {
              struct ktls_wq *wq;
              bool running;
      
              /* Mark it for freeing. */
              m->m_epg_flags |= EPG_FLAG_2FREE;
              wq = &ktls_wq[m->m_epg_tls->wq_index];
              mtx_lock(&wq->mtx);
              STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
              running = wq->running;
              mtx_unlock(&wq->mtx);
              if (!running)
                      wakeup(wq);
      }
      
      void
      ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
      {
              struct ktls_wq *wq;
              bool running;
      
              KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
                  (M_EXTPG | M_NOTREADY)),
                  ("ktls_enqueue: %p not unready & nomap mbuf\n", m));
              KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count"));
      
              KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf"));
      
              m->m_epg_enc_cnt = page_count;
      
              /*
               * Save a pointer to the socket.  The caller is responsible
               * for taking an additional reference via soref().
               */
              m->m_epg_so = so;
      
              wq = &ktls_wq[m->m_epg_tls->wq_index];
              mtx_lock(&wq->mtx);
              STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
              running = wq->running;
              mtx_unlock(&wq->mtx);
              if (!running)
                      wakeup(wq);
              counter_u64_add(ktls_cnt_tx_queued, 1);
      }
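
/*
 * Caller sketch (illustrative): the transmit path that queues a record
 * is expected to hold an extra socket reference across the asynchronous
 * encryption, roughly as below; ktls_encrypt() drops that reference via
 * sorele() once the mbufs are marked ready or the connection is
 * aborted.  Locking requirements are elided here.
 */
#if 0
	soref(so);
	ktls_enqueue(m, so, page_count);
#endif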
      
      static __noinline void
      ktls_encrypt(struct mbuf *top)
      {
              struct ktls_session *tls;
              struct socket *so;
              struct mbuf *m;
              vm_paddr_t parray[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)];
              struct iovec src_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)];
              struct iovec dst_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)];
              vm_page_t pg;
              int error, i, len, npages, off, total_pages;
              bool is_anon;
      
              so = top->m_epg_so;
              tls = top->m_epg_tls;
              KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
              KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
      #ifdef INVARIANTS
              top->m_epg_so = NULL;
      #endif
              total_pages = top->m_epg_enc_cnt;
              npages = 0;
      
              /*
               * Encrypt the TLS records in the chain of mbufs starting with
               * 'top'.  'total_pages' gives us a total count of pages and is
               * used to know when we have finished encrypting the TLS
               * records originally queued with 'top'.
               *
               * NB: These mbufs are queued in the socket buffer and
               * 'm_next' is traversing the mbufs in the socket buffer.  The
               * socket buffer lock is not held while traversing this chain.
               * Since the mbufs are all marked M_NOTREADY their 'm_next'
               * pointers should be stable.  However, the 'm_next' of the
               * last mbuf encrypted is not necessarily NULL.  It can point
               * to other mbufs appended while 'top' was on the TLS work
               * queue.
               *
               * Each mbuf holds an entire TLS record.
               */
              error = 0;
              for (m = top; npages != total_pages; m = m->m_next) {
                      KASSERT(m->m_epg_tls == tls,
                          ("different TLS sessions in a single mbuf chain: %p vs %p",
                          tls, m->m_epg_tls));
                      KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
                          (M_EXTPG | M_NOTREADY),
                          ("%p not unready & nomap mbuf (top = %p)\n", m, top));
                      KASSERT(npages + m->m_epg_npgs <= total_pages,
                          ("page count mismatch: top %p, total_pages %d, m %p", top,
                          total_pages, m));
      
                      /*
		 * Generate source and destination iovecs to pass to
                       * the SW encryption backend.  For writable mbufs, the
                       * destination iovec is a copy of the source and
                       * encryption is done in place.  For file-backed mbufs
                       * (from sendfile), anonymous wired pages are
                       * allocated and assigned to the destination iovec.
                       */
                      is_anon = (m->m_epg_flags & EPG_FLAG_ANON) != 0;
      
                      off = m->m_epg_1st_off;
                      for (i = 0; i < m->m_epg_npgs; i++, off = 0) {
                              len = m_epg_pagelen(m, i, off);
                              src_iov[i].iov_len = len;
                              src_iov[i].iov_base =
                                  (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]) +
                                      off;
      
                              if (is_anon) {
                                      dst_iov[i].iov_base = src_iov[i].iov_base;
                                      dst_iov[i].iov_len = src_iov[i].iov_len;
                                      continue;
                              }
      retry_page:
                              pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
                                  VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED);
                              if (pg == NULL) {
                                      vm_wait(NULL);
                                      goto retry_page;
                              }
                              parray[i] = VM_PAGE_TO_PHYS(pg);
                              dst_iov[i].iov_base =
                                  (char *)(void *)PHYS_TO_DMAP(parray[i]) + off;
                              dst_iov[i].iov_len = len;
                      }
      
                      npages += i;
      
                      error = (*tls->sw_encrypt)(tls,
                          (const struct tls_record_layer *)m->m_epg_hdr,
                          m->m_epg_trail, src_iov, dst_iov, i, m->m_epg_seqno,
                          m->m_epg_record_type);
                      if (error) {
                              counter_u64_add(ktls_offload_failed_crypto, 1);
                              break;
                      }
      
                      /*
                       * For file-backed mbufs, release the file-backed
                       * pages and replace them in the ext_pgs array with
                       * the anonymous wired pages allocated above.
                       */
                      if (!is_anon) {
                              /* Free the old pages. */
                              m->m_ext.ext_free(m);
      
                              /* Replace them with the new pages. */
                              for (i = 0; i < m->m_epg_npgs; i++)
                                      m->m_epg_pa[i] = parray[i];
      
                              /* Use the basic free routine. */
                              m->m_ext.ext_free = mb_free_mext_pgs;
      
                              /* Pages are now writable. */
                              m->m_epg_flags |= EPG_FLAG_ANON;
                      }
      
                      /*
                       * Drop a reference to the session now that it is no
                       * longer needed.  Existing code depends on encrypted
                       * records having no associated session vs
                       * yet-to-be-encrypted records having an associated
                       * session.
                       */
                      m->m_epg_tls = NULL;
                      ktls_free(tls);
              }
      
              CURVNET_SET(so->so_vnet);
              if (error == 0) {
                      (void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages);
              } else {
                      so->so_proto->pr_usrreqs->pru_abort(so);
                      so->so_error = EIO;
                      mb_free_notready(top, total_pages);
              }
      
              SOCK_LOCK(so);
              sorele(so);
              CURVNET_RESTORE();
      }
      
      static void
      ktls_work_thread(void *ctx)
      {
              struct ktls_wq *wq = ctx;
              struct mbuf *m, *n;
              struct socket *so, *son;
              STAILQ_HEAD(, mbuf) local_m_head;
              STAILQ_HEAD(, socket) local_so_head;
      
      #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
              fpu_kern_thread(0);
      #endif
              for (;;) {
                      mtx_lock(&wq->mtx);
                      while (STAILQ_EMPTY(&wq->m_head) &&
                          STAILQ_EMPTY(&wq->so_head)) {
                              wq->running = false;
                              mtx_sleep(wq, &wq->mtx, 0, "-", 0);
                              wq->running = true;
                      }
      
                      STAILQ_INIT(&local_m_head);
                      STAILQ_CONCAT(&local_m_head, &wq->m_head);
                      STAILQ_INIT(&local_so_head);
                      STAILQ_CONCAT(&local_so_head, &wq->so_head);
                      mtx_unlock(&wq->mtx);
      
                      STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) {
                              if (m->m_epg_flags & EPG_FLAG_2FREE) {
                                      ktls_free(m->m_epg_tls);
                                      uma_zfree(zone_mbuf, m);
                              } else {
                                      ktls_encrypt(m);
                                      counter_u64_add(ktls_cnt_tx_queued, -1);
                              }
                      }
      
                      STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) {
                              ktls_decrypt(so);
                              counter_u64_add(ktls_cnt_rx_queued, -1);
                      }
              }
      }
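
/*
 * Handoff note: producers (ktls_enqueue(), ktls_enqueue_to_free() and
 * the TLS RX queueing path above) insert work and sample wq->running
 * while holding wq->mtx, and this thread only clears 'running' under
 * the same mutex and re-checks both queues before sleeping, so a
 * wakeup cannot be lost even though producers skip wakeup() whenever
 * 'running' is still true.
 */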
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2020 Alexander V. Chernikov
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      #include "opt_inet.h"
      #include "opt_route.h"
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/lock.h>
      #include <sys/rwlock.h>
      #include <sys/malloc.h>
      #include <sys/mbuf.h>
      #include <sys/socket.h>
      #include <sys/kernel.h>
      
      #include <net/if.h>
      #include <net/if_var.h>
      #include <net/route.h>
      #include <net/route/route_var.h>
      #include <net/route/nhop_utils.h>
      #include <net/route/nhop.h>
      #include <net/route/nhop_var.h>
      #include <net/vnet.h>
      
      /*
 * This file contains the data structure management logic for the nexthop
 *  ("nhop") route subsystem.
 *
 * Nexthops in the original sense are the objects containing all the necessary
 * information to forward the packet to the selected destination.
 * In particular, a nexthop is defined by a combination of
 *  ifp, ifa, aifp, mtu, gw addr (if set), nh_type, nh_family, mask of rt_flags
 *  and NHF_DEFAULT.
 *
 * All nexthops are stored in a resizable hash table.
 * Additionally, each nexthop gets assigned its own unique index (the nexthop
 * index) so userland programs can interact with nexthops more easily.  Index
 * allocation is backed by a bitmask array.
       */
      
      static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
      
      /* Hash management functions */
      
      int
      nhops_init_rib(struct rib_head *rh)
      {
              struct nh_control *ctl;
              size_t alloc_size;
              uint32_t num_buckets, num_items;
              void *ptr;
      
              ctl = malloc(sizeof(struct nh_control), M_NHOP, M_WAITOK | M_ZERO);
      
              /*
               * Allocate nexthop hash. Start with 16 items by default (128 bytes).
	 * This will be enough for most cases.
               */
              num_buckets = 16;
              alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
              ptr = malloc(alloc_size, M_NHOP, M_WAITOK | M_ZERO);
              CHT_SLIST_INIT(&ctl->nh_head, ptr, num_buckets);
      
              /*
               * Allocate nexthop index bitmask.
               */
              num_items = 128 * 8; /* 128 bytes */
              ptr = malloc(bitmask_get_size(num_items), M_NHOP, M_WAITOK | M_ZERO);
              bitmask_init(&ctl->nh_idx_head, ptr, num_items);
      
              NHOPS_LOCK_INIT(ctl);
      
              rh->nh_control = ctl;
              ctl->ctl_rh = rh;
      
              DPRINTF("NHOPS init for fib %u af %u: ctl %p rh %p", rh->rib_fibnum,
                  rh->rib_family, ctl, rh);
      
              return (0);
      }
      
      static void
      destroy_ctl(struct nh_control *ctl)
      {
      
              NHOPS_LOCK_DESTROY(ctl);
              free(ctl->nh_head.ptr, M_NHOP);
              free(ctl->nh_idx_head.idx, M_NHOP);
              free(ctl, M_NHOP);
      }
      
      /*
       * Epoch callback indicating ctl is safe to destroy
       */
      static void
      destroy_ctl_epoch(epoch_context_t ctx)
      {
              struct nh_control *ctl;
      
              ctl = __containerof(ctx, struct nh_control, ctl_epoch_ctx);
      
              destroy_ctl(ctl);
      }
      
      void
      nhops_destroy_rib(struct rib_head *rh)
      {
              struct nh_control *ctl;
              struct nhop_priv *nh_priv;
      
              ctl = rh->nh_control;
      
              /*
	 * All routes should have been deleted in rt_table_destroy().
	 * However, the TCP stack or other consumers may still hold
	 *  referenced nexthop pointers.  When those references drop to
	 *  zero, nhop_free() will try to unlink the records from the
	 *  data structures, most likely leading to a panic.
	 *
	 * Avoid that by explicitly marking all of the remaining
	 *  nexthops as unlinked by removing a reference from a special
	 *  counter.  Please see the nhop_free() comments for more
	 *  details.
               */
      
              NHOPS_WLOCK(ctl);
              CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
                      DPRINTF("Marking nhop %u unlinked", nh_priv->nh_idx);
                      refcount_release(&nh_priv->nh_linked);
              } CHT_SLIST_FOREACH_END;
              NHOPS_WUNLOCK(ctl);
      
              /*
	 * Postpone destruction until the end of the current epoch
	 * so nhop_free() can safely use the nh_control pointer.
               */
              epoch_call(net_epoch_preempt, destroy_ctl_epoch,
                  &ctl->ctl_epoch_ctx);
      }
      
      /*
 * Nexthop hash calculation:
 *
 * Nexthop distribution:
 * 2 "mandatory" nexthops per interface ("interface route", "loopback").
 * For direct peering: 1 nexthop for the peering router per ifp/af.
 * For IX-like peering: tens to hundreds of neighbor nexthops per ifp/af.
 * IGP control plane & broadcast segment: tens of nexthops per ifp/af.
 *
 * Each fib/af combination has its own hash table.
 * With that in mind, hash nexthops by the combination of the interface
 *  and GW IP address.
 *
 * To optimize hash calculation, ignore the higher bytes of the ifindex, as
 *  they give very little entropy.
 * Similarly, use the lower 4 bytes of an IPv6 address to distinguish between
 *  neighbors.
       */
      struct _hash_data {
              uint16_t        ifindex;
              uint8_t                family;
              uint8_t                nh_type;
              uint32_t        gw_addr;
      };
      
      static unsigned
      djb_hash(const unsigned char *h, const int len)
      {
              unsigned int result = 0;
              int i;
      
              for (i = 0; i < len; i++)
                      result = 33 * result ^ h[i];
      
              return (result);
      }
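
/*
 * Worked example (illustrative): hashing the two bytes { 0x01, 0x02 }
 * yields result = (33 * 0) ^ 0x01 = 1 after the first byte and
 * (33 * 1) ^ 0x02 = 35 after the second, so djb_hash() returns 35.
 */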
      
      static uint32_t
      hash_priv(const struct nhop_priv *priv)
      {
              struct nhop_object *nh;
              uint16_t ifindex;
              struct _hash_data key;
      
              nh = priv->nh;
              ifindex = nh->nh_ifp->if_index & 0xFFFF;
              memset(&key, 0, sizeof(key));
      
              key.ifindex = ifindex;
              key.family = nh->gw_sa.sa_family;
              key.nh_type = priv->nh_type & 0xFF;
              if (nh->gw_sa.sa_family == AF_INET6)
                      memcpy(&key.gw_addr, &nh->gw6_sa.sin6_addr.s6_addr32[3], 4);
              else if (nh->gw_sa.sa_family == AF_INET)
		memcpy(&key.gw_addr, &nh->gw4_sa.sin_addr, 4);
      
              return (uint32_t)(djb_hash((const unsigned char *)&key, sizeof(key)));
      }
      
/*
 * Checks if the hash needs resizing and performs the resize if necessary.
 */
      static void
      consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items)
      {
              void *nh_ptr, *nh_idx_ptr;
              void *old_idx_ptr;
              size_t alloc_size;
      
              nh_ptr = NULL;
              if (new_nh_buckets != 0) {
                      alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets);
                      nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
              }
      
              nh_idx_ptr = NULL;
              if (new_idx_items != 0) {
                      alloc_size = bitmask_get_size(new_idx_items);
                      nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
              }
      
              if (nh_ptr == NULL && nh_idx_ptr == NULL) {
                      /* Either resize is not required or allocations have failed. */
                      return;
              }
      
              DPRINTF("going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", nh_ptr,
                  new_nh_buckets, nh_idx_ptr, new_idx_items);
      
              old_idx_ptr = NULL;
      
              NHOPS_WLOCK(ctl);
              if (nh_ptr != NULL) {
                      CHT_SLIST_RESIZE(&ctl->nh_head, nhops, nh_ptr, new_nh_buckets);
              }
              if (nh_idx_ptr != NULL) {
                      if (bitmask_copy(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items) == 0)
                              bitmask_swap(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr);
              }
              NHOPS_WUNLOCK(ctl);
      
              if (nh_ptr != NULL)
                      free(nh_ptr, M_NHOP);
              if (old_idx_ptr != NULL)
                      free(old_idx_ptr, M_NHOP);
      }
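
/*
 * Design note (illustrative): consider_resize() allocates the new
 * bucket array and index bitmask with M_NOWAIT before taking the write
 * lock, swaps them in under NHOPS_WLOCK() and frees the leftover
 * buffers only after dropping it, so the lock is never held across
 * malloc()/free().  A failed M_NOWAIT allocation simply postpones the
 * resize until the next link/unlink attempt recomputes the target
 * sizes and tries again.
 */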
      
      /*
 * Links nexthop @nh_priv to the nexthop hash table and allocates
 *  a nexthop index.
 * Returns the allocated index or 0 on failure.
       */
      int
      link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv)
      {
              uint16_t idx;
              uint32_t num_buckets_new, num_items_new;
      
              KASSERT((nh_priv->nh_idx == 0), ("nhop index is already allocated"));
              NHOPS_WLOCK(ctl);
      
              /*
	 * Check if we need to resize the hash and index.
	 * The following two functions return either the new size or 0
	 *  if a resize is not required.
               */
              num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head);
              num_items_new = bitmask_get_resize_items(&ctl->nh_idx_head);
      
              if (bitmask_alloc_idx(&ctl->nh_idx_head, &idx) != 0) {
                      NHOPS_WUNLOCK(ctl);
                      DPRINTF("Unable to allocate nhop index");
                      RTSTAT_INC(rts_nh_idx_alloc_failure);
                      consider_resize(ctl, num_buckets_new, num_items_new);
                      return (0);
              }
      
              nh_priv->nh_idx = idx;
              nh_priv->nh_control = ctl;
      
              CHT_SLIST_INSERT_HEAD(&ctl->nh_head, nhops, nh_priv);
      
              NHOPS_WUNLOCK(ctl);
      
              DPRINTF("Linked nhop priv %p to %d, hash %u, ctl %p", nh_priv, idx,
                  hash_priv(nh_priv), ctl);
              consider_resize(ctl, num_buckets_new, num_items_new);
      
              return (idx);
      }
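
/*
 * Hypothetical caller sketch: a zero return from link_nhop() means no
 * index could be allocated and the nexthop is not in the hash, so the
 * caller has to unwind whatever it constructed.
 */
#if 0
	if (link_nhop(ctl, nh_priv) == 0) {
		/* not linked: destroy the partially constructed nexthop */
	}
#endif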
      
      /*
       * Unlinks nexthop specified by @nh_priv data from the hash.
       *
       * Returns found nexthop or NULL.
       */
      struct nhop_priv *
      unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv_del)
      {
              struct nhop_priv *priv_ret;
              int idx;
              uint32_t num_buckets_new, num_items_new;
      
              idx = 0;
      
              NHOPS_WLOCK(ctl);
              CHT_SLIST_REMOVE_BYOBJ(&ctl->nh_head, nhops, nh_priv_del, priv_ret);
      
              if (priv_ret != NULL) {
                      idx = priv_ret->nh_idx;
                      priv_ret->nh_idx = 0;
      
                      KASSERT((idx != 0), ("bogus nhop index 0"));
                      if ((bitmask_free_idx(&ctl->nh_idx_head, idx)) != 0) {
                              DPRINTF("Unable to remove index %d from fib %u af %d",
                                  idx, ctl->ctl_rh->rib_fibnum,
                                  ctl->ctl_rh->rib_family);
                      }
              }
      
              /* Check if hash or index needs to be resized */
              num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head);
              num_items_new = bitmask_get_resize_items(&ctl->nh_idx_head);
      
              NHOPS_WUNLOCK(ctl);
      
              if (priv_ret == NULL)
                      DPRINTF("Unable to unlink nhop priv %p from hash, hash %u ctl %p",
                          nh_priv_del, hash_priv(nh_priv_del), ctl);
              else
                      DPRINTF("Unlinked nhop %p priv idx %d", priv_ret, idx);
      
              consider_resize(ctl, num_buckets_new, num_items_new);
      
              return (priv_ret);
      }
      
      /*
 * Searches for the nexthop by the data specified in @nh_priv.
       * Returns referenced nexthop or NULL.
       */
      struct nhop_priv *
      find_nhop(struct nh_control *ctl, const struct nhop_priv *nh_priv)
{
              struct nhop_priv *nh_priv_ret;
      
              NHOPS_RLOCK(ctl);
	CHT_SLIST_FIND_BYOBJ(&ctl->nh_head, nhops, nh_priv, nh_priv_ret);
              if (nh_priv_ret != NULL) {
		if (refcount_acquire_if_not_zero(&nh_priv_ret->nh_refcnt) == 0) {
                              /* refcount was 0 -> nhop is being deleted */
                              nh_priv_ret = NULL;
                      }
              }
              NHOPS_RUNLOCK(ctl);
      
              return (nh_priv_ret);
      }
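
/*
 * Note on the lookup above: refcount_acquire_if_not_zero() is used
 * instead of a plain refcount_acquire() because an entry whose
 * reference count already reached zero is being torn down by
 * nhop_free(); bumping it back from zero would hand the caller a
 * nexthop that is about to be freed, so such an entry is treated as a
 * miss instead.
 */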
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       * $FreeBSD$
       */
      
      /*
       * Machine dependent interrupt code for x86.  For x86, we have to
       * deal with different PICs.  Thus, we use the passed in vector to lookup
       * an interrupt source associated with that vector.  The interrupt source
       * describes which PIC the source belongs to and includes methods to handle
       * that source.
       */
      
      #include "opt_atpic.h"
      #include "opt_ddb.h"
      #include "opt_smp.h"
      
      #include <sys/param.h>
      #include <sys/bus.h>
      #include <sys/interrupt.h>
      #include <sys/ktr.h>
      #include <sys/kernel.h>
      #include <sys/lock.h>
      #include <sys/malloc.h>
      #include <sys/mutex.h>
      #include <sys/proc.h>
      #include <sys/queue.h>
      #include <sys/sbuf.h>
      #include <sys/smp.h>
      #include <sys/sx.h>
      #include <sys/sysctl.h>
      #include <sys/syslog.h>
      #include <sys/systm.h>
      #include <sys/taskqueue.h>
      #include <sys/vmmeter.h>
      #include <machine/clock.h>
      #include <machine/intr_machdep.h>
      #include <machine/smp.h>
      #ifdef DDB
      #include <ddb/ddb.h>
      #endif
      
      #ifndef DEV_ATPIC
      #include <machine/segments.h>
      #include <machine/frame.h>
      #include <dev/ic/i8259.h>
      #include <x86/isa/icu.h>
      #include <isa/isareg.h>
      #endif
      
      #include <vm/vm.h>
      
      #define        MAX_STRAY_LOG        5
      
      typedef void (*mask_fn)(void *);
      
      static int intrcnt_index;
      static struct intsrc **interrupt_sources;
      #ifdef SMP
      static struct intsrc **interrupt_sorted;
      static int intrbalance;
      SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RWTUN, &intrbalance, 0,
          "Interrupt auto-balance interval (seconds).  Zero disables.");
      static struct timeout_task intrbalance_task;
      #endif
      static struct sx intrsrc_lock;
      static struct mtx intrpic_lock;
      static struct mtx intrcnt_lock;
      static TAILQ_HEAD(pics_head, pic) pics;
      u_int num_io_irqs;
      
      #if defined(SMP) && !defined(EARLY_AP_STARTUP)
      static int assign_cpu;
      #endif
      
      u_long *intrcnt;
      char *intrnames;
      size_t sintrcnt = sizeof(intrcnt);
      size_t sintrnames = sizeof(intrnames);
      int nintrcnt;
      
      static MALLOC_DEFINE(M_INTR, "intr", "Interrupt Sources");
      
      static int        intr_assign_cpu(void *arg, int cpu);
      static void        intr_disable_src(void *arg);
      static void        intr_init(void *__dummy);
      static int        intr_pic_registered(struct pic *pic);
      static void        intrcnt_setname(const char *name, int index);
      static void        intrcnt_updatename(struct intsrc *is);
      static void        intrcnt_register(struct intsrc *is);
      
      /*
       * SYSINIT levels for SI_SUB_INTR:
       *
       * SI_ORDER_FIRST: Initialize locks and pics TAILQ, xen_hvm_cpu_init
       * SI_ORDER_SECOND: Xen PICs
       * SI_ORDER_THIRD: Add I/O APIC PICs, alloc MSI and Xen IRQ ranges
       * SI_ORDER_FOURTH: Add 8259A PICs
       * SI_ORDER_FOURTH + 1: Finalize interrupt count and add interrupt sources
       * SI_ORDER_MIDDLE: SMP interrupt counters
       * SI_ORDER_ANY: Enable interrupts on BSP
       */
      
      static int
      intr_pic_registered(struct pic *pic)
      {
              struct pic *p;
      
              TAILQ_FOREACH(p, &pics, pics) {
                      if (p == pic)
                              return (1);
              }
              return (0);
      }
      
      /*
       * Register a new interrupt controller (PIC).  This is to support suspend
       * and resume where we suspend/resume controllers rather than individual
       * sources.  This also allows controllers with no active sources (such as
       * 8259As in a system using the APICs) to participate in suspend and resume.
       */
      int
      intr_register_pic(struct pic *pic)
      {
              int error;
      
              mtx_lock(&intrpic_lock);
              if (intr_pic_registered(pic))
                      error = EBUSY;
              else {
                      TAILQ_INSERT_TAIL(&pics, pic, pics);
                      error = 0;
              }
              mtx_unlock(&intrpic_lock);
              return (error);
      }
      
      /*
       * Allocate interrupt source arrays and register interrupt sources
       * once the number of interrupts is known.
       */
      static void
      intr_init_sources(void *arg)
      {
              struct pic *pic;
      
              MPASS(num_io_irqs > 0);
      
              interrupt_sources = mallocarray(num_io_irqs, sizeof(*interrupt_sources),
                  M_INTR, M_WAITOK | M_ZERO);
      #ifdef SMP
              interrupt_sorted = mallocarray(num_io_irqs, sizeof(*interrupt_sorted),
                  M_INTR, M_WAITOK | M_ZERO);
      #endif
      
              /*
	 * - 1 dummy counter at index 0 (named "???").
               * - 2 counters for each I/O interrupt.
               * - 1 counter for each CPU for lapic timer.
               * - 1 counter for each CPU for the Hyper-V vmbus driver.
               * - 8 counters for each CPU for IPI counters for SMP.
               */
              nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2;
      #ifdef COUNT_IPIS
              if (mp_ncpus > 1)
                      nintrcnt += 8 * mp_ncpus;
      #endif
              intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK |
                  M_ZERO);
              intrnames = mallocarray(nintrcnt, MAXCOMLEN + 1, M_INTR, M_WAITOK |
                  M_ZERO);
              sintrcnt = nintrcnt * sizeof(u_long);
              sintrnames = nintrcnt * (MAXCOMLEN + 1);
      
              intrcnt_setname("???", 0);
              intrcnt_index = 1;
      
              /*
               * NB: intrpic_lock is not held here to avoid LORs due to
               * malloc() in intr_register_source().  However, we are still
               * single-threaded at this point in startup so the list of
               * PICs shouldn't change.
               */
              TAILQ_FOREACH(pic, &pics, pics) {
                      if (pic->pic_register_sources != NULL)
                              pic->pic_register_sources(pic);
              }
      }
      SYSINIT(intr_init_sources, SI_SUB_INTR, SI_ORDER_FOURTH + 1, intr_init_sources,
          NULL);
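
/*
 * Worked example of the sizing above (illustrative numbers): with
 * num_io_irqs = 24 and mp_ncpus = 4, nintrcnt = 1 + 24 * 2 + 4 * 2 = 57
 * counters; a COUNT_IPIS kernel on the same machine adds 8 * 4 = 32
 * more, for a total of 89.
 */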
      
      /*
       * Register a new interrupt source with the global interrupt system.
       * The global interrupts need to be disabled when this function is
       * called.
       */
      int
      intr_register_source(struct intsrc *isrc)
      {
              int error, vector;
      
              KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC"));
              vector = isrc->is_pic->pic_vector(isrc);
              KASSERT(vector < num_io_irqs, ("IRQ %d too large (%u irqs)", vector,
                  num_io_irqs));
              if (interrupt_sources[vector] != NULL)
                      return (EEXIST);
              error = intr_event_create(&isrc->is_event, isrc, 0, vector,
                  intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source,
                  (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:",
                  vector);
              if (error)
                      return (error);
              sx_xlock(&intrsrc_lock);
              if (interrupt_sources[vector] != NULL) {
                      sx_xunlock(&intrsrc_lock);
                      intr_event_destroy(isrc->is_event);
                      return (EEXIST);
              }
              intrcnt_register(isrc);
              interrupt_sources[vector] = isrc;
              isrc->is_handlers = 0;
              sx_xunlock(&intrsrc_lock);
              return (0);
      }
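
/*
 * Note on the double check above: the first interrupt_sources[vector]
 * test is an unlocked fast path, and intr_event_create() may sleep, so
 * the slot is re-checked under intrsrc_lock before publishing 'isrc'.
 * A racing registration for the same vector loses cleanly with EEXIST
 * and destroys its freshly created event.
 */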
      
      struct intsrc *
      intr_lookup_source(int vector)
{
      
              if (vector < 0 || vector >= num_io_irqs)
                      return (NULL);
	return (interrupt_sources[vector]);
      }
      
      int
      intr_add_handler(const char *name, int vector, driver_filter_t filter,
          driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep,
          int domain)
      {
              struct intsrc *isrc;
              int error;
      
              isrc = intr_lookup_source(vector);
              if (isrc == NULL)
                      return (EINVAL);
              error = intr_event_add_handler(isrc->is_event, name, filter, handler,
                  arg, intr_priority(flags), flags, cookiep);
              if (error == 0) {
                      sx_xlock(&intrsrc_lock);
                      intrcnt_updatename(isrc);
                      isrc->is_handlers++;
                      if (isrc->is_handlers == 1) {
                              isrc->is_domain = domain;
                              isrc->is_pic->pic_enable_intr(isrc);
                              isrc->is_pic->pic_enable_source(isrc);
                      }
                      sx_xunlock(&intrsrc_lock);
              }
              return (error);
      }
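
/*
 * Hypothetical caller sketch ('mydev_intr', 'mydev_attach_intr',
 * 'softc' and 'vec' are made-up names): registering an MPSAFE ithread
 * handler on an already-known vector.  Real drivers normally reach
 * this function indirectly through bus_setup_intr(9) rather than
 * calling it here.
 */
#if 0
	static void
	mydev_intr(void *arg)
	{
		/* acknowledge and service the (made-up) device */
	}

	static int
	mydev_attach_intr(void *softc, int vec, void **cookiep)
	{
		return (intr_add_handler("mydev", vec, NULL, mydev_intr,
		    softc, INTR_TYPE_MISC | INTR_MPSAFE, cookiep, 0));
	}
#endif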
      
      int
      intr_remove_handler(void *cookie)
      {
              struct intsrc *isrc;
              int error;
      
              isrc = intr_handler_source(cookie);
              error = intr_event_remove_handler(cookie);
              if (error == 0) {
                      sx_xlock(&intrsrc_lock);
                      isrc->is_handlers--;
                      if (isrc->is_handlers == 0) {
                              isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI);
                              isrc->is_pic->pic_disable_intr(isrc);
                      }
                      intrcnt_updatename(isrc);
                      sx_xunlock(&intrsrc_lock);
              }
              return (error);
      }
      
      int
      intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol)
      {
              struct intsrc *isrc;
      
              isrc = intr_lookup_source(vector);
              if (isrc == NULL)
                      return (EINVAL);
              return (isrc->is_pic->pic_config_intr(isrc, trig, pol));
      }
      
      static void
      intr_disable_src(void *arg)
      {
              struct intsrc *isrc;
      
              isrc = arg;
              isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
      }
      
      void
      intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame)
{
              struct intr_event *ie;
              int vector;
      
              /*
               * We count software interrupts when we process them.  The
               * code here follows previous practice, but there's an
               * argument for counting hardware interrupts when they're
               * processed too.
               */
              (*isrc->is_count)++;
              VM_CNT_INC(v_intr);
      
              ie = isrc->is_event;
      
              /*
               * XXX: We assume that IRQ 0 is only used for the ISA timer
               * device (clk).
               */
              vector = isrc->is_pic->pic_vector(isrc);
	if (vector == 0)
                      clkintr_pending = 1;
      
              /*
               * For stray interrupts, mask and EOI the source, bump the
               * stray count, and log the condition.
               */
	if (intr_event_handle(ie, frame) != 0) {
                      isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
                      (*isrc->is_straycount)++;
                      if (*isrc->is_straycount < MAX_STRAY_LOG)
                              log(LOG_ERR, "stray irq%d\n", vector);
                      else if (*isrc->is_straycount == MAX_STRAY_LOG)
                              log(LOG_CRIT,
                                  "too many stray irq %d's: not logging anymore\n",
                                  vector);
              }
      }
      
      void
      intr_resume(bool suspend_cancelled)
      {
              struct pic *pic;
      
      #ifndef DEV_ATPIC
              atpic_reset();
      #endif
              mtx_lock(&intrpic_lock);
              TAILQ_FOREACH(pic, &pics, pics) {
                      if (pic->pic_resume != NULL)
                              pic->pic_resume(pic, suspend_cancelled);
              }
              mtx_unlock(&intrpic_lock);
      }
      
      void
      intr_suspend(void)
      {
              struct pic *pic;
      
              mtx_lock(&intrpic_lock);
              TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) {
                      if (pic->pic_suspend != NULL)
                              pic->pic_suspend(pic);
              }
              mtx_unlock(&intrpic_lock);
      }
      
      static int
      intr_assign_cpu(void *arg, int cpu)
      {
      #ifdef SMP
              struct intsrc *isrc;
              int error;
      
      #ifdef EARLY_AP_STARTUP
              MPASS(mp_ncpus == 1 || smp_started);
      
              /* Nothing to do if there is only a single CPU. */
              if (mp_ncpus > 1 && cpu != NOCPU) {
      #else
              /*
               * Don't do anything during early boot.  We will pick up the
               * assignment once the APs are started.
               */
              if (assign_cpu && cpu != NOCPU) {
      #endif
                      isrc = arg;
                      sx_xlock(&intrsrc_lock);
                      error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
                      if (error == 0)
                              isrc->is_cpu = cpu;
                      sx_xunlock(&intrsrc_lock);
              } else
                      error = 0;
              return (error);
      #else
              return (EOPNOTSUPP);
      #endif
      }
      
      static void
      intrcnt_setname(const char *name, int index)
      {
      
              snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s",
                  MAXCOMLEN, name);
      }
      
      static void
      intrcnt_updatename(struct intsrc *is)
      {
      
              intrcnt_setname(is->is_event->ie_fullname, is->is_index);
      }
      
      static void
      intrcnt_register(struct intsrc *is)
      {
              char straystr[MAXCOMLEN + 1];
      
              KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__));
              mtx_lock_spin(&intrcnt_lock);
              MPASS(intrcnt_index + 2 <= nintrcnt);
              is->is_index = intrcnt_index;
              intrcnt_index += 2;
              snprintf(straystr, MAXCOMLEN + 1, "stray irq%d",
                  is->is_pic->pic_vector(is));
              intrcnt_updatename(is);
              is->is_count = &intrcnt[is->is_index];
              intrcnt_setname(straystr, is->is_index + 1);
              is->is_straycount = &intrcnt[is->is_index + 1];
              mtx_unlock_spin(&intrcnt_lock);
      }
      
      void
      intrcnt_add(const char *name, u_long **countp)
      {
      
              mtx_lock_spin(&intrcnt_lock);
              MPASS(intrcnt_index < nintrcnt);
              *countp = &intrcnt[intrcnt_index];
              intrcnt_setname(name, intrcnt_index);
              intrcnt_index++;
              mtx_unlock_spin(&intrcnt_lock);
      }
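
/*
 * Hypothetical usage sketch: a subsystem that wants its own line in
 * the interrupt counters (as the local APIC timer and Hyper-V vmbus
 * entries budgeted above do) registers a name once and then bumps the
 * returned counter from its handler.  All names below are made up.
 */
#if 0
	static u_long *mydev_count;

	static void
	mydev_register_counter(void)
	{
		intrcnt_add("cpu0:mydev", &mydev_count);
	}

	static void
	mydev_event(void)
	{
		(*mydev_count)++;
	}
#endif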
      
      static void
      intr_init(void *dummy __unused)
      {
      
              TAILQ_INIT(&pics);
              mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF);
              sx_init(&intrsrc_lock, "intrsrc");
              mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN);
      }
      SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL);
      
      static void
      intr_init_final(void *dummy __unused)
      {
      
              /*
               * Enable interrupts on the BSP after all of the interrupt
               * controllers are initialized.  Device interrupts are still
               * disabled in the interrupt controllers until interrupt
               * handlers are registered.  Interrupts are enabled on each AP
               * after their first context switch.
               */
              enable_intr();
      }
      SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL);
      
      #ifndef DEV_ATPIC
      /* Initialize the two 8259A's to a known-good shutdown state. */
      void
      atpic_reset(void)
      {
      
              outb(IO_ICU1, ICW1_RESET | ICW1_IC4);
              outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS);
              outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID));
              outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE);
              outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff);
              outb(IO_ICU1, OCW3_SEL | OCW3_RR);
      
              outb(IO_ICU2, ICW1_RESET | ICW1_IC4);
              outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8);
              outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID);
              outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE);
              outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff);
              outb(IO_ICU2, OCW3_SEL | OCW3_RR);
      }
      #endif
      
      /* Add a description to an active interrupt handler. */
      int
      intr_describe(u_int vector, void *ih, const char *descr)
      {
              struct intsrc *isrc;
              int error;
      
              isrc = intr_lookup_source(vector);
              if (isrc == NULL)
                      return (EINVAL);
              error = intr_event_describe_handler(isrc->is_event, ih, descr);
              if (error)
                      return (error);
              intrcnt_updatename(isrc);
              return (0);
      }
      
      void
      intr_reprogram(void)
      {
              struct intsrc *is;
              u_int v;
      
              sx_xlock(&intrsrc_lock);
              for (v = 0; v < num_io_irqs; v++) {
                      is = interrupt_sources[v];
                      if (is == NULL)
                              continue;
                      if (is->is_pic->pic_reprogram_pin != NULL)
                              is->is_pic->pic_reprogram_pin(is);
              }
              sx_xunlock(&intrsrc_lock);
      }
      
      #ifdef DDB
      /*
       * Dump data about interrupt handlers
       */
      DB_SHOW_COMMAND(irqs, db_show_irqs)
      {
              struct intsrc **isrc;
              u_int i;
              int verbose;
      
              if (strcmp(modif, "v") == 0)
                      verbose = 1;
              else
                      verbose = 0;
              isrc = interrupt_sources;
              for (i = 0; i < num_io_irqs && !db_pager_quit; i++, isrc++)
                      if (*isrc != NULL)
                              db_dump_intr_event((*isrc)->is_event, verbose);
      }
      #endif
      
      #ifdef SMP
      /*
       * Support for balancing interrupt sources across CPUs.  For now we just
       * allocate CPUs round-robin.
       */
      
      cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
      static int current_cpu[MAXMEMDOM];
      
      static void
      intr_init_cpus(void)
      {
              int i;
      
              for (i = 0; i < vm_ndomains; i++) {
                      current_cpu[i] = 0;
                      if (!CPU_ISSET(current_cpu[i], &intr_cpus) ||
                          !CPU_ISSET(current_cpu[i], &cpuset_domain[i]))
                              intr_next_cpu(i);
              }
      }
      
      /*
       * Return the CPU that the next interrupt source should use.  For now
       * this just returns the next local APIC according to round-robin.
       */
      u_int
      intr_next_cpu(int domain)
      {
              u_int apic_id;
      
      #ifdef EARLY_AP_STARTUP
              MPASS(mp_ncpus == 1 || smp_started);
              if (mp_ncpus == 1)
                      return (PCPU_GET(apic_id));
      #else
              /* Leave all interrupts on the BSP during boot. */
              if (!assign_cpu)
                      return (PCPU_GET(apic_id));
      #endif
      
              mtx_lock_spin(&icu_lock);
              apic_id = cpu_apic_ids[current_cpu[domain]];
              do {
                      current_cpu[domain]++;
                      if (current_cpu[domain] > mp_maxid)
                              current_cpu[domain] = 0;
              } while (!CPU_ISSET(current_cpu[domain], &intr_cpus) ||
                  !CPU_ISSET(current_cpu[domain], &cpuset_domain[domain]));
              mtx_unlock_spin(&icu_lock);
              return (apic_id);
      }
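
/*
 * Illustrative example: with a single memory domain and four CPUs all
 * present in intr_cpus, successive calls return the local APIC IDs of
 * CPUs 0, 1, 2, 3, 0, ... so newly enabled sources are spread
 * round-robin across the machine.
 */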
      
      /* Attempt to bind the specified IRQ to the specified CPU. */
      int
      intr_bind(u_int vector, u_char cpu)
      {
              struct intsrc *isrc;
      
              isrc = intr_lookup_source(vector);
              if (isrc == NULL)
                      return (EINVAL);
              return (intr_event_bind(isrc->is_event, cpu));
      }
      
      /*
       * Add a CPU to our mask of valid CPUs that can be destinations of
       * interrupts.
       */
      void
      intr_add_cpu(u_int cpu)
      {
      
              if (cpu >= MAXCPU)
                      panic("%s: Invalid CPU ID", __func__);
              if (bootverbose)
                      printf("INTR: Adding local APIC %d as a target\n",
                          cpu_apic_ids[cpu]);
      
              CPU_SET(cpu, &intr_cpus);
      }
      
      #ifdef EARLY_AP_STARTUP
      static void
      intr_smp_startup(void *arg __unused)
      {
      
              intr_init_cpus();
              return;
      }
      SYSINIT(intr_smp_startup, SI_SUB_SMP, SI_ORDER_SECOND, intr_smp_startup,
          NULL);
      
      #else
      /*
       * Distribute all the interrupt sources among the available CPUs once the
 * APs have been launched.
       */
      static void
      intr_shuffle_irqs(void *arg __unused)
      {
              struct intsrc *isrc;
              u_int cpu, i;
      
              intr_init_cpus();
              /* Don't bother on UP. */
              if (mp_ncpus == 1)
                      return;
      
              /* Round-robin assign a CPU to each enabled source. */
              sx_xlock(&intrsrc_lock);
              assign_cpu = 1;
              for (i = 0; i < num_io_irqs; i++) {
                      isrc = interrupt_sources[i];
                      if (isrc != NULL && isrc->is_handlers > 0) {
                              /*
                               * If this event is already bound to a CPU,
                               * then assign the source to that CPU instead
                               * of picking one via round-robin.  Note that
                               * this is careful to only advance the
                               * round-robin if the CPU assignment succeeds.
                               */
                              cpu = isrc->is_event->ie_cpu;
                              if (cpu == NOCPU)
                                      cpu = current_cpu[isrc->is_domain];
                              if (isrc->is_pic->pic_assign_cpu(isrc,
                                  cpu_apic_ids[cpu]) == 0) {
                                      isrc->is_cpu = cpu;
                                      if (isrc->is_event->ie_cpu == NOCPU)
                                              intr_next_cpu(isrc->is_domain);
                              }
                      }
              }
              sx_xunlock(&intrsrc_lock);
      }
      SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs,
          NULL);
      #endif
      
      /*
       * TODO: Export this information in a non-MD fashion, integrate with vmstat -i.
       */
      static int
      sysctl_hw_intrs(SYSCTL_HANDLER_ARGS)
      {
              struct sbuf sbuf;
              struct intsrc *isrc;
              u_int i;
              int error;
      
              error = sysctl_wire_old_buffer(req, 0);
              if (error != 0)
                      return (error);
      
              sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
              sx_slock(&intrsrc_lock);
              for (i = 0; i < num_io_irqs; i++) {
                      isrc = interrupt_sources[i];
                      if (isrc == NULL)
                              continue;
                      sbuf_printf(&sbuf, "%s:%d @cpu%d(domain%d): %ld\n",
                          isrc->is_event->ie_fullname,
                          isrc->is_index,
                          isrc->is_cpu,
                          isrc->is_domain,
                          *isrc->is_count);
              }
      
              sx_sunlock(&intrsrc_lock);
              error = sbuf_finish(&sbuf);
              sbuf_delete(&sbuf);
              return (error);
      }
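/*
 * The sbuf_printf() format above emits one line per allocated source.  An
 * illustrative (made up) line, assuming ie_fullname is "irq20: em0":
 *
 *	irq20: em0:20 @cpu1(domain0): 12345
 *
 * matching the "interrupt:number @cpu: count" description registered below.
 */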
      SYSCTL_PROC(_hw, OID_AUTO, intrs,
          CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
          0, 0, sysctl_hw_intrs, "A",
          "interrupt:number @cpu: count");
      
      /*
       * Compare two, possibly NULL, entries in the interrupt source array
       * by load.
       */
      static int
      intrcmp(const void *one, const void *two)
      {
              const struct intsrc *i1, *i2;
      
              i1 = *(const struct intsrc * const *)one;
              i2 = *(const struct intsrc * const *)two;
              if (i1 != NULL && i2 != NULL)
                      return (*i1->is_count - *i2->is_count);
              if (i1 != NULL)
                      return (1);
              if (i2 != NULL)
                      return (-1);
              return (0);
      }
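/*
 * With this comparator, qsort() leaves NULL slots and lightly loaded
 * sources at the front of interrupt_sorted[] and the busiest sources at
 * the end, which is why intr_balance() below walks the array backwards.
 * The result is a plain int difference of the counters, so the ordering
 * is a heuristic rather than an exact wide-integer comparison.
 */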
      
      /*
       * Balance IRQs across available CPUs according to load.
       */
      static void
      intr_balance(void *dummy __unused, int pending __unused)
      {
              struct intsrc *isrc;
              int interval;
              u_int cpu;
              int i;
      
              interval = intrbalance;
              if (interval == 0)
                      goto out;
      
              /*
               * Sort interrupts according to count.
               */
              sx_xlock(&intrsrc_lock);
              memcpy(interrupt_sorted, interrupt_sources, num_io_irqs *
                  sizeof(interrupt_sorted[0]));
              qsort(interrupt_sorted, num_io_irqs, sizeof(interrupt_sorted[0]),
                  intrcmp);
      
              /*
               * Restart the scan from the same location to avoid moving in the
               * common case.
               */
              intr_init_cpus();
      
              /*
               * Assign round-robin from most loaded to least.
               */
              for (i = num_io_irqs - 1; i >= 0; i--) {
                      isrc = interrupt_sorted[i];
		if (isrc == NULL || isrc->is_event->ie_cpu != NOCPU)
                              continue;
                      cpu = current_cpu[isrc->is_domain];
                      intr_next_cpu(isrc->is_domain);
                      if (isrc->is_cpu != cpu &&
                          isrc->is_pic->pic_assign_cpu(isrc,
                          cpu_apic_ids[cpu]) == 0)
                              isrc->is_cpu = cpu;
              }
              sx_xunlock(&intrsrc_lock);
      out:
              taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task,
                  interval ? hz * interval : hz * 60);
      
      }
      
      static void
      intr_balance_init(void *dummy __unused)
      {
      
              TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance,
                  NULL);
              taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz);
      }
      SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL);
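/*
 * The rebalance interval comes from the intrbalance knob (declared
 * elsewhere in this file): a nonzero value reruns intr_balance() every
 * intrbalance seconds, while zero skips the balancing pass and merely
 * rechecks the knob once a minute (the "hz * 60" case above).
 */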
      
      #else
      /*
       * Always route interrupts to the current processor in the UP case.
       */
      u_int
      intr_next_cpu(int domain)
      {
      
              return (PCPU_GET(apic_id));
      }
      #endif
      /*-
       * SPDX-License-Identifier: BSD-3-Clause
       *
       * Copyright (c) 2017 Dell EMC
       * Copyright (c) 2000-2001, 2003 David O'Brien
       * Copyright (c) 1995-1996 Søren Schmidt
       * Copyright (c) 1996 Peter Wemm
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer
       *    in this position and unchanged.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. The name of the author may not be used to endorse or promote products
       *    derived from this software without specific prior written permission
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
       * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
       * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
       * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
       * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
       * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
       * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_capsicum.h"
      
      #include <sys/param.h>
      #include <sys/capsicum.h>
      #include <sys/compressor.h>
      #include <sys/exec.h>
      #include <sys/fcntl.h>
      #include <sys/imgact.h>
      #include <sys/imgact_elf.h>
      #include <sys/jail.h>
      #include <sys/kernel.h>
      #include <sys/lock.h>
      #include <sys/malloc.h>
      #include <sys/mount.h>
      #include <sys/mman.h>
      #include <sys/namei.h>
      #include <sys/proc.h>
      #include <sys/procfs.h>
      #include <sys/ptrace.h>
      #include <sys/racct.h>
      #include <sys/resourcevar.h>
      #include <sys/rwlock.h>
      #include <sys/sbuf.h>
      #include <sys/sf_buf.h>
      #include <sys/smp.h>
      #include <sys/systm.h>
      #include <sys/signalvar.h>
      #include <sys/stat.h>
      #include <sys/sx.h>
      #include <sys/syscall.h>
      #include <sys/sysctl.h>
      #include <sys/sysent.h>
      #include <sys/vnode.h>
      #include <sys/syslog.h>
      #include <sys/eventhandler.h>
      #include <sys/user.h>
      
      #include <vm/vm.h>
      #include <vm/vm_kern.h>
      #include <vm/vm_param.h>
      #include <vm/pmap.h>
      #include <vm/vm_map.h>
      #include <vm/vm_object.h>
      #include <vm/vm_extern.h>
      
      #include <machine/elf.h>
      #include <machine/md_var.h>
      
      #define ELF_NOTE_ROUNDSIZE        4
      #define OLD_EI_BRAND        8
      
      static int __elfN(check_header)(const Elf_Ehdr *hdr);
      static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
          const char *interp, int32_t *osrel, uint32_t *fctl0);
      static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
          u_long *entry);
      static int __elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset,
          caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot);
      static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
      static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note,
          int32_t *osrel);
      static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
      static boolean_t __elfN(check_note)(struct image_params *imgp,
          Elf_Brandnote *checknote, int32_t *osrel, boolean_t *has_fctl0,
          uint32_t *fctl0);
      static vm_prot_t __elfN(trans_prot)(Elf_Word);
      static Elf_Word __elfN(untrans_prot)(vm_prot_t);
      
      SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE),
          CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "");
      
      #define        CORE_BUF_SIZE        (16 * 1024)
      
      int __elfN(fallback_brand) = -1;
      SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
          fallback_brand, CTLFLAG_RWTUN, &__elfN(fallback_brand), 0,
          __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");
      
      static int elf_legacy_coredump = 0;
      SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, 
          &elf_legacy_coredump, 0,
          "include all and only RW pages in core dumps");
      
      int __elfN(nxstack) =
      #if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */ || \
          (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__) || \
          defined(__riscv)
              1;
      #else
              0;
      #endif
      SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
          nxstack, CTLFLAG_RW, &__elfN(nxstack), 0,
          __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack");
      
      #if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__))
      int i386_read_exec = 0;
      SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0,
          "enable execution from readable segments");
      #endif
      
      static u_long __elfN(pie_base) = ET_DYN_LOAD_ADDR;
      static int
      sysctl_pie_base(SYSCTL_HANDLER_ARGS)
      {
              u_long val;
              int error;
      
              val = __elfN(pie_base);
              error = sysctl_handle_long(oidp, &val, 0, req);
              if (error != 0 || req->newptr == NULL)
                      return (error);
              if ((val & PAGE_MASK) != 0)
                      return (EINVAL);
              __elfN(pie_base) = val;
              return (0);
      }
      SYSCTL_PROC(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, pie_base,
          CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
          sysctl_pie_base, "LU",
          "PIE load base without randomization");
      
      SYSCTL_NODE(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, aslr,
          CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "");
      #define        ASLR_NODE_OID        __CONCAT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), _aslr)
      
      static int __elfN(aslr_enabled) = 0;
      SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, enable, CTLFLAG_RWTUN,
          &__elfN(aslr_enabled), 0,
          __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
          ": enable address map randomization");
      
      static int __elfN(pie_aslr_enabled) = 0;
      SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, pie_enable, CTLFLAG_RWTUN,
          &__elfN(pie_aslr_enabled), 0,
          __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
          ": enable address map randomization for PIE binaries");
      
      static int __elfN(aslr_honor_sbrk) = 1;
      SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, honor_sbrk, CTLFLAG_RW,
          &__elfN(aslr_honor_sbrk), 0,
          __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": assume sbrk is used");
      
      static int __elfN(aslr_stack_gap) = 3;
      SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, stack_gap, CTLFLAG_RW,
          &__elfN(aslr_stack_gap), 0,
          __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
          ": maximum percentage of main stack to waste on a random gap");
      
      static int __elfN(sigfastblock) = 1;
      SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, sigfastblock,
          CTLFLAG_RWTUN, &__elfN(sigfastblock), 0,
          "enable sigfastblock for new processes");
      
      static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
      
      #define        aligned(a, t)        (rounddown2((u_long)(a), sizeof(t)) == (u_long)(a))
      
      static const char FREEBSD_ABI_VENDOR[] = "FreeBSD";
      
      Elf_Brandnote __elfN(freebsd_brandnote) = {
              .hdr.n_namesz        = sizeof(FREEBSD_ABI_VENDOR),
              .hdr.n_descsz        = sizeof(int32_t),
              .hdr.n_type        = NT_FREEBSD_ABI_TAG,
              .vendor                = FREEBSD_ABI_VENDOR,
              .flags                = BN_TRANSLATE_OSREL,
              .trans_osrel        = __elfN(freebsd_trans_osrel)
      };
      
      static bool
      __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel)
      {
              uintptr_t p;
      
              p = (uintptr_t)(note + 1);
              p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
              *osrel = *(const int32_t *)(p);
      
              return (true);
      }
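/*
 * For reference, the note being parsed has the standard ELF note layout,
 * with the vendor name padded to a multiple of ELF_NOTE_ROUNDSIZE and the
 * descriptor holding the 32-bit osrel.  Illustrative byte offsets:
 *
 *	 0: n_namesz = 8		("FreeBSD\0")
 *	 4: n_descsz = 4
 *	 8: n_type   = NT_FREEBSD_ABI_TAG
 *	12: "FreeBSD\0"			(already a multiple of 4)
 *	20: int32_t osrel		(e.g. 1300139)
 *
 * (note + 1) skips the fixed header and the roundup2() of n_namesz skips
 * the padded name, leaving p pointing at the descriptor.
 */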
      
      static const char GNU_ABI_VENDOR[] = "GNU";
      static int GNU_KFREEBSD_ABI_DESC = 3;
      
      Elf_Brandnote __elfN(kfreebsd_brandnote) = {
              .hdr.n_namesz        = sizeof(GNU_ABI_VENDOR),
              .hdr.n_descsz        = 16,        /* XXX at least 16 */
              .hdr.n_type        = 1,
              .vendor                = GNU_ABI_VENDOR,
              .flags                = BN_TRANSLATE_OSREL,
              .trans_osrel        = kfreebsd_trans_osrel
      };
      
      static bool
      kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel)
      {
              const Elf32_Word *desc;
              uintptr_t p;
      
              p = (uintptr_t)(note + 1);
              p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
      
              desc = (const Elf32_Word *)p;
              if (desc[0] != GNU_KFREEBSD_ABI_DESC)
                      return (false);
      
              /*
	 * Debian GNU/kFreeBSD embeds the earliest compatible kernel version
               * (__FreeBSD_version: <major><two digit minor>Rxx) in the LSB way.
               */
              *osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3];
      
              return (true);
      }
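/*
 * Worked example of the encoding above: a descriptor advertising kernel
 * 8.1.0 (desc[1] = 8, desc[2] = 1, desc[3] = 0) yields
 *
 *	osrel = 8 * 100000 + 1 * 1000 + 0 = 801000
 *
 * which matches the usual __FreeBSD_version layout of
 * <major><two digit minor><patch>.
 */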
      
      int
      __elfN(insert_brand_entry)(Elf_Brandinfo *entry)
      {
              int i;
      
              for (i = 0; i < MAX_BRANDS; i++) {
                      if (elf_brand_list[i] == NULL) {
                              elf_brand_list[i] = entry;
                              break;
                      }
              }
              if (i == MAX_BRANDS) {
                      printf("WARNING: %s: could not insert brandinfo entry: %p\n",
                              __func__, entry);
                      return (-1);
              }
              return (0);
      }
      
      int
      __elfN(remove_brand_entry)(Elf_Brandinfo *entry)
      {
              int i;
      
              for (i = 0; i < MAX_BRANDS; i++) {
                      if (elf_brand_list[i] == entry) {
                              elf_brand_list[i] = NULL;
                              break;
                      }
              }
              if (i == MAX_BRANDS)
                      return (-1);
              return (0);
      }
      
      int
      __elfN(brand_inuse)(Elf_Brandinfo *entry)
      {
              struct proc *p;
              int rval = FALSE;
      
              sx_slock(&allproc_lock);
              FOREACH_PROC_IN_SYSTEM(p) {
                      if (p->p_sysent == entry->sysvec) {
                              rval = TRUE;
                              break;
                      }
              }
              sx_sunlock(&allproc_lock);
      
              return (rval);
      }
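/*
 * Illustrative sketch of how an ABI module might use the three routines
 * above, where "example_brand" is a hypothetical, fully initialized
 * Elf_Brandinfo:
 *
 *	if (__elfN(insert_brand_entry)(&example_brand) != 0)
 *		return (EEXIST);
 *	...
 *	if (__elfN(brand_inuse)(&example_brand))
 *		return (EBUSY);
 *	(void)__elfN(remove_brand_entry)(&example_brand);
 */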
      
      static Elf_Brandinfo *
      __elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
          int32_t *osrel, uint32_t *fctl0)
      {
              const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
              Elf_Brandinfo *bi, *bi_m;
              boolean_t ret, has_fctl0;
              int i, interp_name_len;
      
              interp_name_len = interp != NULL ? strlen(interp) + 1 : 0;
      
	/*
	 * We support four types of branding -- (1) the ELF EI_OSABI field
	 * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
	 * branding within the ELF header, (3) the path in the `interp_path'
	 * field, and (4) the ".note.ABI-tag" ELF section.
	 */
      
              /* Look for an ".note.ABI-tag" ELF section */
              bi_m = NULL;
              for (i = 0; i < MAX_BRANDS; i++) {
                      bi = elf_brand_list[i];
                      if (bi == NULL)
                              continue;
                      if (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0)
                              continue;
                      if (hdr->e_machine == bi->machine && (bi->flags &
                          (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) {
                              has_fctl0 = false;
                              *fctl0 = 0;
                              *osrel = 0;
                              ret = __elfN(check_note)(imgp, bi->brand_note, osrel,
                                  &has_fctl0, fctl0);
                              /* Give brand a chance to veto check_note's guess */
                              if (ret && bi->header_supported) {
                                      ret = bi->header_supported(imgp, osrel,
                                          has_fctl0 ? fctl0 : NULL);
                              }
			/*
			 * If the note checker claimed the binary, but the
			 * interpreter path in the image does not match the
			 * default one for the brand, search for other brands
			 * with the same interpreter.  Either there is a
			 * better brand with the right interpreter, or,
			 * failing that, we return the first brand which
			 * accepted our note and, optionally, header.
			 */
                              if (ret && bi_m == NULL && interp != NULL &&
                                  (bi->interp_path == NULL ||
                                  (strlen(bi->interp_path) + 1 != interp_name_len ||
                                  strncmp(interp, bi->interp_path, interp_name_len)
                                  != 0))) {
                                      bi_m = bi;
                                      ret = 0;
                              }
                              if (ret)
                                      return (bi);
                      }
              }
              if (bi_m != NULL)
                      return (bi_m);
      
              /* If the executable has a brand, search for it in the brand list. */
              for (i = 0; i < MAX_BRANDS; i++) {
                      bi = elf_brand_list[i];
                      if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 ||
                          (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0))
                              continue;
                      if (hdr->e_machine == bi->machine &&
                          (hdr->e_ident[EI_OSABI] == bi->brand ||
                          (bi->compat_3_brand != NULL &&
                          strcmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
                          bi->compat_3_brand) == 0))) {
                              /* Looks good, but give brand a chance to veto */
                              if (bi->header_supported == NULL ||
                                  bi->header_supported(imgp, NULL, NULL)) {
                                      /*
                                       * Again, prefer strictly matching
                                       * interpreter path.
                                       */
                                      if (interp_name_len == 0 &&
                                          bi->interp_path == NULL)
                                              return (bi);
                                      if (bi->interp_path != NULL &&
                                          strlen(bi->interp_path) + 1 ==
                                          interp_name_len && strncmp(interp,
                                          bi->interp_path, interp_name_len) == 0)
                                              return (bi);
                                      if (bi_m == NULL)
                                              bi_m = bi;
                              }
                      }
              }
              if (bi_m != NULL)
                      return (bi_m);
      
              /* No known brand, see if the header is recognized by any brand */
              for (i = 0; i < MAX_BRANDS; i++) {
                      bi = elf_brand_list[i];
                      if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY ||
                          bi->header_supported == NULL)
                              continue;
                      if (hdr->e_machine == bi->machine) {
                              ret = bi->header_supported(imgp, NULL, NULL);
                              if (ret)
                                      return (bi);
                      }
              }
      
              /* Lacking a known brand, search for a recognized interpreter. */
              if (interp != NULL) {
                      for (i = 0; i < MAX_BRANDS; i++) {
                              bi = elf_brand_list[i];
                              if (bi == NULL || (bi->flags &
                                  (BI_BRAND_NOTE_MANDATORY | BI_BRAND_ONLY_STATIC))
                                  != 0)
                                      continue;
                              if (hdr->e_machine == bi->machine &&
                                  bi->interp_path != NULL &&
                                  /* ELF image p_filesz includes terminating zero */
                                  strlen(bi->interp_path) + 1 == interp_name_len &&
                                  strncmp(interp, bi->interp_path, interp_name_len)
                                  == 0 && (bi->header_supported == NULL ||
                                  bi->header_supported(imgp, NULL, NULL)))
                                      return (bi);
                      }
              }
      
              /* Lacking a recognized interpreter, try the default brand */
              for (i = 0; i < MAX_BRANDS; i++) {
                      bi = elf_brand_list[i];
                      if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 ||
                          (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0))
                              continue;
                      if (hdr->e_machine == bi->machine &&
                          __elfN(fallback_brand) == bi->brand &&
                          (bi->header_supported == NULL ||
                          bi->header_supported(imgp, NULL, NULL)))
                              return (bi);
              }
              return (NULL);
      }
      
      static bool
      __elfN(phdr_in_zero_page)(const Elf_Ehdr *hdr)
      {
              return (hdr->e_phoff <= PAGE_SIZE &&
                  (u_int)hdr->e_phentsize * hdr->e_phnum <= PAGE_SIZE - hdr->e_phoff);
      }
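/*
 * Example of the bound enforced above: with 4096-byte pages and a 64-bit
 * ELF image (e_phoff normally 64, e_phentsize 56), at most
 * (4096 - 64) / 56 = 72 program headers fit in the first page; anything
 * larger is rejected by the callers with ENOEXEC.
 */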
      
      static int
      __elfN(check_header)(const Elf_Ehdr *hdr)
{
              Elf_Brandinfo *bi;
              int i;
      
	if (!IS_ELF(*hdr) ||
                  hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
                  hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
                  hdr->e_ident[EI_VERSION] != EV_CURRENT ||
                  hdr->e_phentsize != sizeof(Elf_Phdr) ||
                  hdr->e_version != ELF_TARG_VER)
                      return (ENOEXEC);
      
              /*
               * Make sure we have at least one brand for this machine.
               */
      
              for (i = 0; i < MAX_BRANDS; i++) {
                      bi = elf_brand_list[i];
                      if (bi != NULL && bi->machine == hdr->e_machine)
                              break;
              }
              if (i == MAX_BRANDS)
                      return (ENOEXEC);
      
              return (0);
      }
      
      static int
      __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
          vm_offset_t start, vm_offset_t end, vm_prot_t prot)
      {
              struct sf_buf *sf;
              int error;
              vm_offset_t off;
      
              /*
               * Create the page if it doesn't exist yet. Ignore errors.
               */
              vm_map_fixed(map, NULL, 0, trunc_page(start), round_page(end) -
                  trunc_page(start), VM_PROT_ALL, VM_PROT_ALL, MAP_CHECK_EXCL);
      
              /*
               * Find the page from the underlying object.
               */
              if (object != NULL) {
                      sf = vm_imgact_map_page(object, offset);
                      if (sf == NULL)
                              return (KERN_FAILURE);
                      off = offset - trunc_page(offset);
                      error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
                          end - start);
                      vm_imgact_unmap_page(sf);
                      if (error != 0)
                              return (KERN_FAILURE);
              }
      
              return (KERN_SUCCESS);
      }
      
      static int
      __elfN(map_insert)(struct image_params *imgp, vm_map_t map, vm_object_t object,
          vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot,
          int cow)
      {
              struct sf_buf *sf;
              vm_offset_t off;
              vm_size_t sz;
              int error, locked, rv;
      
              if (start != trunc_page(start)) {
                      rv = __elfN(map_partial)(map, object, offset, start,
                          round_page(start), prot);
                      if (rv != KERN_SUCCESS)
                              return (rv);
                      offset += round_page(start) - start;
                      start = round_page(start);
              }
              if (end != round_page(end)) {
                      rv = __elfN(map_partial)(map, object, offset +
                          trunc_page(end) - start, trunc_page(end), end, prot);
                      if (rv != KERN_SUCCESS)
                              return (rv);
                      end = trunc_page(end);
              }
              if (start >= end)
                      return (KERN_SUCCESS);
              if ((offset & PAGE_MASK) != 0) {
                      /*
                       * The mapping is not page aligned.  This means that we have
                       * to copy the data.
                       */
                      rv = vm_map_fixed(map, NULL, 0, start, end - start,
                          prot | VM_PROT_WRITE, VM_PROT_ALL, MAP_CHECK_EXCL);
                      if (rv != KERN_SUCCESS)
                              return (rv);
                      if (object == NULL)
                              return (KERN_SUCCESS);
                      for (; start < end; start += sz) {
                              sf = vm_imgact_map_page(object, offset);
                              if (sf == NULL)
                                      return (KERN_FAILURE);
                              off = offset - trunc_page(offset);
                              sz = end - start;
                              if (sz > PAGE_SIZE - off)
                                      sz = PAGE_SIZE - off;
                              error = copyout((caddr_t)sf_buf_kva(sf) + off,
                                  (caddr_t)start, sz);
                              vm_imgact_unmap_page(sf);
                              if (error != 0)
                                      return (KERN_FAILURE);
                              offset += sz;
                      }
              } else {
                      vm_object_reference(object);
                      rv = vm_map_fixed(map, object, offset, start, end - start,
                          prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL |
                          (object != NULL ? MAP_VN_EXEC : 0));
                      if (rv != KERN_SUCCESS) {
                              locked = VOP_ISLOCKED(imgp->vp);
                              VOP_UNLOCK(imgp->vp);
                              vm_object_deallocate(object);
                              vn_lock(imgp->vp, locked | LK_RETRY);
                              return (rv);
                      } else if (object != NULL) {
                              MPASS(imgp->vp->v_object == object);
                              VOP_SET_TEXT_CHECKED(imgp->vp);
                      }
              }
              return (KERN_SUCCESS);
      }
      
      static int
      __elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset,
          caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
      {
              struct sf_buf *sf;
              size_t map_len;
              vm_map_t map;
              vm_object_t object;
              vm_offset_t map_addr;
              int error, rv, cow;
              size_t copy_len;
              vm_ooffset_t file_addr;
      
              /*
               * It's necessary to fail if the filsz + offset taken from the
               * header is greater than the actual file pager object's size.
               * If we were to allow this, then the vm_map_find() below would
               * walk right off the end of the file object and into the ether.
               *
               * While I'm here, might as well check for something else that
               * is invalid: filsz cannot be greater than memsz.
               */
              if ((filsz != 0 && (off_t)filsz + offset > imgp->attr->va_size) ||
                  filsz > memsz) {
                      uprintf("elf_load_section: truncated ELF file\n");
                      return (ENOEXEC);
              }
      
              object = imgp->object;
              map = &imgp->proc->p_vmspace->vm_map;
              map_addr = trunc_page((vm_offset_t)vmaddr);
              file_addr = trunc_page(offset);
      
              /*
               * We have two choices.  We can either clear the data in the last page
               * of an oversized mapping, or we can start the anon mapping a page
               * early and copy the initialized data into that first page.  We
               * choose the second.
               */
              if (filsz == 0)
                      map_len = 0;
              else if (memsz > filsz)
                      map_len = trunc_page(offset + filsz) - file_addr;
              else
                      map_len = round_page(offset + filsz) - file_addr;
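	/*
	 * Worked example (illustrative numbers, 4 KB pages): a segment with
	 * offset = 0, filsz = 0x1800 and memsz = 0x3000 at vmaddr = 0x400000
	 * takes the memsz > filsz branch, so map_len = 0x1000 and only the
	 * first full page is mapped from the file.  The remaining 0x800
	 * bytes of file data become copy_len below and are copied into the
	 * first page of the anonymous bss mapping that starts at 0x401000.
	 */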
      
              if (map_len != 0) {
                      /* cow flags: don't dump readonly sections in core */
                      cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
                          (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);
      
                      rv = __elfN(map_insert)(imgp, map, object, file_addr,
                          map_addr, map_addr + map_len, prot, cow);
                      if (rv != KERN_SUCCESS)
                              return (EINVAL);
      
                      /* we can stop now if we've covered it all */
                      if (memsz == filsz)
                              return (0);
              }
      
              /*
               * We have to get the remaining bit of the file into the first part
               * of the oversized map segment.  This is normally because the .data
               * segment in the file is extended to provide bss.  It's a neat idea
               * to try and save a page, but it's a pain in the behind to implement.
               */
              copy_len = filsz == 0 ? 0 : (offset + filsz) - trunc_page(offset +
                  filsz);
              map_addr = trunc_page((vm_offset_t)vmaddr + filsz);
              map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr;
      
              /* This had damn well better be true! */
              if (map_len != 0) {
                      rv = __elfN(map_insert)(imgp, map, NULL, 0, map_addr,
                          map_addr + map_len, prot, 0);
                      if (rv != KERN_SUCCESS)
                              return (EINVAL);
              }
      
              if (copy_len != 0) {
                      sf = vm_imgact_map_page(object, offset + filsz);
                      if (sf == NULL)
                              return (EIO);
      
                      /* send the page fragment to user space */
                      error = copyout((caddr_t)sf_buf_kva(sf), (caddr_t)map_addr,
                          copy_len);
                      vm_imgact_unmap_page(sf);
                      if (error != 0)
                              return (error);
              }
      
              /*
               * Remove write access to the page if it was only granted by map_insert
               * to allow copyout.
               */
              if ((prot & VM_PROT_WRITE) == 0)
                      vm_map_protect(map, trunc_page(map_addr), round_page(map_addr +
                          map_len), prot, FALSE);
      
              return (0);
      }
      
      static int
      __elfN(load_sections)(struct image_params *imgp, const Elf_Ehdr *hdr,
          const Elf_Phdr *phdr, u_long rbase, u_long *base_addrp)
      {
              vm_prot_t prot;
              u_long base_addr;
              bool first;
              int error, i;
      
              ASSERT_VOP_LOCKED(imgp->vp, __func__);
      
              base_addr = 0;
              first = true;
      
              for (i = 0; i < hdr->e_phnum; i++) {
                      if (phdr[i].p_type != PT_LOAD || phdr[i].p_memsz == 0)
                              continue;
      
                      /* Loadable segment */
                      prot = __elfN(trans_prot)(phdr[i].p_flags);
                      error = __elfN(load_section)(imgp, phdr[i].p_offset,
                          (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
                          phdr[i].p_memsz, phdr[i].p_filesz, prot);
                      if (error != 0)
                              return (error);
      
                      /*
                       * Establish the base address if this is the first segment.
                       */
                      if (first) {
			base_addr = trunc_page(phdr[i].p_vaddr + rbase);
                              first = false;
                      }
              }
      
              if (base_addrp != NULL)
                      *base_addrp = base_addr;
      
              return (0);
      }
      
      /*
       * Load the file "file" into memory.  It may be either a shared object
       * or an executable.
       *
       * The "addr" reference parameter is in/out.  On entry, it specifies
       * the address where a shared object should be loaded.  If the file is
       * an executable, this value is ignored.  On exit, "addr" specifies
       * where the file was actually loaded.
       *
       * The "entry" reference parameter is out only.  On exit, it specifies
       * the entry point for the loaded file.
       */
      static int
      __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
              u_long *entry)
      {
              struct {
                      struct nameidata nd;
                      struct vattr attr;
                      struct image_params image_params;
              } *tempdata;
              const Elf_Ehdr *hdr = NULL;
              const Elf_Phdr *phdr = NULL;
              struct nameidata *nd;
              struct vattr *attr;
              struct image_params *imgp;
              u_long rbase;
              u_long base_addr = 0;
              int error;
      
      #ifdef CAPABILITY_MODE
              /*
               * XXXJA: This check can go away once we are sufficiently confident
               * that the checks in namei() are correct.
               */
              if (IN_CAPABILITY_MODE(curthread))
                      return (ECAPMODE);
      #endif
      
              tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK | M_ZERO);
              nd = &tempdata->nd;
              attr = &tempdata->attr;
              imgp = &tempdata->image_params;
      
              /*
               * Initialize part of the common data
               */
              imgp->proc = p;
              imgp->attr = attr;
      
              NDINIT(nd, LOOKUP, ISOPEN | FOLLOW | LOCKSHARED | LOCKLEAF,
                  UIO_SYSSPACE, file, curthread);
              if ((error = namei(nd)) != 0) {
                      nd->ni_vp = NULL;
                      goto fail;
              }
              NDFREE(nd, NDF_ONLY_PNBUF);
              imgp->vp = nd->ni_vp;
      
              /*
               * Check permissions, modes, uid, etc on the file, and "open" it.
               */
              error = exec_check_permissions(imgp);
              if (error)
                      goto fail;
      
              error = exec_map_first_page(imgp);
              if (error)
                      goto fail;
      
              imgp->object = nd->ni_vp->v_object;
      
              hdr = (const Elf_Ehdr *)imgp->image_header;
              if ((error = __elfN(check_header)(hdr)) != 0)
                      goto fail;
              if (hdr->e_type == ET_DYN)
                      rbase = *addr;
              else if (hdr->e_type == ET_EXEC)
                      rbase = 0;
              else {
                      error = ENOEXEC;
                      goto fail;
              }
      
	/* Only support headers that fit within the first page for now */
              if (!__elfN(phdr_in_zero_page)(hdr)) {
                      error = ENOEXEC;
                      goto fail;
              }
      
              phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
              if (!aligned(phdr, Elf_Addr)) {
                      error = ENOEXEC;
                      goto fail;
              }
      
              error = __elfN(load_sections)(imgp, hdr, phdr, rbase, &base_addr);
              if (error != 0)
                      goto fail;
      
              *addr = base_addr;
              *entry = (unsigned long)hdr->e_entry + rbase;
      
      fail:
              if (imgp->firstpage)
                      exec_unmap_first_page(imgp);
      
              if (nd->ni_vp) {
                      if (imgp->textset)
                              VOP_UNSET_TEXT_CHECKED(nd->ni_vp);
                      vput(nd->ni_vp);
              }
              free(tempdata, M_TEMP);
      
              return (error);
      }
      
      static u_long
      __CONCAT(rnd_, __elfN(base))(vm_map_t map __unused, u_long minv, u_long maxv,
          u_int align)
      {
              u_long rbase, res;
      
              MPASS(vm_map_min(map) <= minv);
              MPASS(maxv <= vm_map_max(map));
              MPASS(minv < maxv);
              MPASS(minv + align < maxv);
              arc4rand(&rbase, sizeof(rbase), 0);
              res = roundup(minv, (u_long)align) + rbase % (maxv - minv);
              res &= ~((u_long)align - 1);
              if (res >= maxv)
                      res -= align;
              KASSERT(res >= minv,
                  ("res %#lx < minv %#lx, maxv %#lx rbase %#lx",
                  res, minv, maxv, rbase));
              KASSERT(res < maxv,
                  ("res %#lx > maxv %#lx, minv %#lx rbase %#lx",
                  res, maxv, minv, rbase));
              return (res);
      }
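/*
 * Worked example of the computation above, assuming minv = 0x10000,
 * maxv = 0x30000, align = 0x10000 and a random rbase of 0x2345:
 *
 *	res = roundup(0x10000, 0x10000) + 0x2345 % 0x20000 = 0x12345
 *	res &= ~(0x10000 - 1)                              = 0x10000
 *
 * so the result is always align-aligned and, after the final clamp,
 * stays within [minv, maxv).
 */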
      
      static int
      __elfN(enforce_limits)(struct image_params *imgp, const Elf_Ehdr *hdr,
          const Elf_Phdr *phdr, u_long et_dyn_addr)
      {
              struct vmspace *vmspace;
              const char *err_str;
              u_long text_size, data_size, total_size, text_addr, data_addr;
              u_long seg_size, seg_addr;
              int i;
      
              err_str = NULL;
              text_size = data_size = total_size = text_addr = data_addr = 0;
      
              for (i = 0; i < hdr->e_phnum; i++) {
                      if (phdr[i].p_type != PT_LOAD || phdr[i].p_memsz == 0)
                              continue;
      
                      seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr);
                      seg_size = round_page(phdr[i].p_memsz +
                          phdr[i].p_vaddr + et_dyn_addr - seg_addr);
      
                      /*
                       * Make the largest executable segment the official
                       * text segment and all others data.
                       *
                       * Note that obreak() assumes that data_addr + data_size == end
                       * of data load area, and the ELF file format expects segments
                       * to be sorted by address.  If multiple data segments exist,
                       * the last one will be used.
                       */
      
                      if ((phdr[i].p_flags & PF_X) != 0 && text_size < seg_size) {
                              text_size = seg_size;
                              text_addr = seg_addr;
                      } else {
                              data_size = seg_size;
                              data_addr = seg_addr;
                      }
                      total_size += seg_size;
              }
      
              if (data_addr == 0 && data_size == 0) {
                      data_addr = text_addr;
                      data_size = text_size;
              }
      
              /*
	 * Check limits.  It should be safe to check the
	 * limits after loading the segments, since we do
	 * not actually fault in all the segments' pages.
               */
              PROC_LOCK(imgp->proc);
              if (data_size > lim_cur_proc(imgp->proc, RLIMIT_DATA))
                      err_str = "Data segment size exceeds process limit";
              else if (text_size > maxtsiz)
                      err_str = "Text segment size exceeds system limit";
              else if (total_size > lim_cur_proc(imgp->proc, RLIMIT_VMEM))
                      err_str = "Total segment size exceeds process limit";
              else if (racct_set(imgp->proc, RACCT_DATA, data_size) != 0)
                      err_str = "Data segment size exceeds resource limit";
              else if (racct_set(imgp->proc, RACCT_VMEM, total_size) != 0)
                      err_str = "Total segment size exceeds resource limit";
              PROC_UNLOCK(imgp->proc);
              if (err_str != NULL) {
                      uprintf("%s\n", err_str);
                      return (ENOMEM);
              }
      
              vmspace = imgp->proc->p_vmspace;
              vmspace->vm_tsize = text_size >> PAGE_SHIFT;
              vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
              vmspace->vm_dsize = data_size >> PAGE_SHIFT;
              vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
      
              return (0);
      }
      
      static int
      __elfN(get_interp)(struct image_params *imgp, const Elf_Phdr *phdr,
          char **interpp, bool *free_interpp)
      {
              struct thread *td;
              char *interp;
              int error, interp_name_len;
      
              KASSERT(phdr->p_type == PT_INTERP,
                  ("%s: p_type %u != PT_INTERP", __func__, phdr->p_type));
              ASSERT_VOP_LOCKED(imgp->vp, __func__);
      
              td = curthread;
      
              /* Path to interpreter */
              if (phdr->p_filesz < 2 || phdr->p_filesz > MAXPATHLEN) {
                      uprintf("Invalid PT_INTERP\n");
                      return (ENOEXEC);
              }
      
              interp_name_len = phdr->p_filesz;
              if (phdr->p_offset > PAGE_SIZE ||
                  interp_name_len > PAGE_SIZE - phdr->p_offset) {
                      /*
                       * The vnode lock might be needed by the pagedaemon to
                       * clean pages owned by the vnode.  Do not allow sleep
                       * waiting for memory with the vnode locked, instead
                       * try non-sleepable allocation first, and if it
	 * fails, go to the slow path where we drop the lock
                       * and do M_WAITOK.  A text reference prevents
                       * modifications to the vnode content.
                       */
                      interp = malloc(interp_name_len + 1, M_TEMP, M_NOWAIT);
                      if (interp == NULL) {
                              VOP_UNLOCK(imgp->vp);
                              interp = malloc(interp_name_len + 1, M_TEMP, M_WAITOK);
                              vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
                      }
      
                      error = vn_rdwr(UIO_READ, imgp->vp, interp,
                          interp_name_len, phdr->p_offset,
                          UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
                          NOCRED, NULL, td);
                      if (error != 0) {
                              free(interp, M_TEMP);
                              uprintf("i/o error PT_INTERP %d\n", error);
                              return (error);
                      }
                      interp[interp_name_len] = '\0';
      
                      *interpp = interp;
                      *free_interpp = true;
                      return (0);
              }
      
              interp = __DECONST(char *, imgp->image_header) + phdr->p_offset;
              if (interp[interp_name_len - 1] != '\0') {
                      uprintf("Invalid PT_INTERP\n");
                      return (ENOEXEC);
              }
      
              *interpp = interp;
              *free_interpp = false;
              return (0);
      }
      
      static int
      __elfN(load_interp)(struct image_params *imgp, const Elf_Brandinfo *brand_info,
          const char *interp, u_long *addr, u_long *entry)
      {
              char *path;
              int error;
      
              if (brand_info->emul_path != NULL &&
                  brand_info->emul_path[0] != '\0') {
                      path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
                      snprintf(path, MAXPATHLEN, "%s%s",
                          brand_info->emul_path, interp);
                      error = __elfN(load_file)(imgp->proc, path, addr, entry);
                      free(path, M_TEMP);
                      if (error == 0)
                              return (0);
              }
      
              if (brand_info->interp_newpath != NULL &&
                  (brand_info->interp_path == NULL ||
                  strcmp(interp, brand_info->interp_path) == 0)) {
                      error = __elfN(load_file)(imgp->proc,
                          brand_info->interp_newpath, addr, entry);
                      if (error == 0)
                              return (0);
              }
      
              error = __elfN(load_file)(imgp->proc, interp, addr, entry);
              if (error == 0)
                      return (0);
      
              uprintf("ELF interpreter %s not found, error %d\n", interp, error);
              return (error);
      }
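/*
 * Example of the lookup order above (paths are illustrative): for a brand
 * whose emul_path is "/compat/foo" and an image whose PT_INTERP is
 * "/libexec/ld-elf.so.1", the kernel first tries
 * "/compat/foo/libexec/ld-elf.so.1", then the brand's interp_newpath if
 * one is configured, and finally the literal interpreter path from the
 * image.
 */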
      
      /*
       * Impossible et_dyn_addr initial value indicating that the real base
       * must be calculated later with some randomization applied.
       */
      #define        ET_DYN_ADDR_RAND        1
      
      static int
      __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
{
              struct thread *td;
              const Elf_Ehdr *hdr;
              const Elf_Phdr *phdr;
              Elf_Auxargs *elf_auxargs;
              struct vmspace *vmspace;
              vm_map_t map;
              char *interp;
              Elf_Brandinfo *brand_info;
              struct sysentvec *sv;
              u_long addr, baddr, et_dyn_addr, entry, proghdr;
              u_long maxalign, mapsz, maxv, maxv1;
              uint32_t fctl0;
              int32_t osrel;
              bool free_interp;
              int error, i, n;
      
              hdr = (const Elf_Ehdr *)imgp->image_header;
      
	/*
	 * Do we have a valid ELF header?
	 *
	 * Only allow ET_EXEC & ET_DYN here; reject ET_DYN later
	 * if the particular brand doesn't support it.
	 */
	if (__elfN(check_header)(hdr) != 0 ||
                  (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN))
                      return (-1);
      
              /*
               * From here on down, we return an errno, not -1, as we've
               * detected an ELF file.
               */
      
              if (!__elfN(phdr_in_zero_page)(hdr)) {
                      uprintf("Program headers not in the first page\n");
                      return (ENOEXEC);
              }
              phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); 
              if (!aligned(phdr, Elf_Addr)) {
                      uprintf("Unaligned program headers\n");
                      return (ENOEXEC);
              }
      
              n = error = 0;
              baddr = 0;
              osrel = 0;
              fctl0 = 0;
              entry = proghdr = 0;
              interp = NULL;
              free_interp = false;
              td = curthread;
              maxalign = PAGE_SIZE;
              mapsz = 0;
      
              for (i = 0; i < hdr->e_phnum; i++) {