/*-
       * SPDX-License-Identifier: BSD-3-Clause
       *
       * Copyright (c) 2002 Poul-Henning Kamp
       * Copyright (c) 2002 Networks Associates Technology, Inc.
       * Copyright (c) 2013 The FreeBSD Foundation
       * All rights reserved.
       *
       * This software was developed for the FreeBSD Project by Poul-Henning Kamp
       * and NAI Labs, the Security Research Division of Network Associates, Inc.
       * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
       * DARPA CHATS research program.
       *
       * Portions of this software were developed by Konstantin Belousov
       * under sponsorship from the FreeBSD Foundation.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. The names of the authors may not be used to endorse or promote
       *    products derived from this software without specific prior written
       *    permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/kernel.h>
      #include <sys/malloc.h>
      #include <sys/bio.h>
      #include <sys/ktr.h>
      #include <sys/proc.h>
      #include <sys/sbuf.h>
      #include <sys/stack.h>
      #include <sys/sysctl.h>
      #include <sys/vmem.h>
      #include <machine/stdarg.h>
      
      #include <sys/errno.h>
      #include <geom/geom.h>
      #include <geom/geom_int.h>
      #include <sys/devicestat.h>
      
      #include <vm/uma.h>
      #include <vm/vm.h>
      #include <vm/vm_param.h>
      #include <vm/vm_kern.h>
      #include <vm/vm_page.h>
      #include <vm/vm_object.h>
      #include <vm/vm_extern.h>
      #include <vm/vm_map.h>
      
      static int        g_io_transient_map_bio(struct bio *bp);
      
      static struct g_bioq g_bio_run_down;
      static struct g_bioq g_bio_run_up;
      
      /*
       * Pace is a hint that we've had some trouble recently allocating
       * bios, so we should back off trying to send I/O down the stack
       * a bit to let the problem resolve. When pacing, we also turn
       * off direct dispatch to also reduce memory pressure from I/Os
        * there, at the expense of some added latency while the memory
       * pressures exist. See g_io_schedule_down() for more details
       * and limitations.
       */
      static volatile u_int __read_mostly pace;
      
      static uma_zone_t __read_mostly biozone;
      
      #include <machine/atomic.h>
      
      static void
      g_bioq_lock(struct g_bioq *bq)
      {
      
              mtx_lock(&bq->bio_queue_lock);
      }
      
      static void
      g_bioq_unlock(struct g_bioq *bq)
      {
      
              mtx_unlock(&bq->bio_queue_lock);
      }
      
      #if 0
      static void
      g_bioq_destroy(struct g_bioq *bq)
      {
      
              mtx_destroy(&bq->bio_queue_lock);
      }
      #endif
      
      static void
      g_bioq_init(struct g_bioq *bq)
      {
      
              TAILQ_INIT(&bq->bio_queue);
              mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
      }
      
      static struct bio *
      g_bioq_first(struct g_bioq *bq)
      {
              struct bio *bp;
      
              bp = TAILQ_FIRST(&bq->bio_queue);
              if (bp != NULL) {
                      KASSERT((bp->bio_flags & BIO_ONQUEUE),
                          ("Bio not on queue bp=%p target %p", bp, bq));
                      bp->bio_flags &= ~BIO_ONQUEUE;
                      TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
                      bq->bio_queue_length--;
              }
              return (bp);
      }
      
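       /*
        * Allocate a fresh, zeroed bio without sleeping.  May return NULL when
        * the zone allocator cannot satisfy the request, so callers must be
        * prepared to handle that.
        */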
      struct bio *
      g_new_bio(void)
      {
              struct bio *bp;
      
              bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
      #ifdef KTR
              if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
                      struct stack st;
      
                      CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
                      stack_save(&st);
                      CTRSTACK(KTR_GEOM, &st, 3);
              }
      #endif
              return (bp);
      }
      
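       /*
        * Allocate a fresh, zeroed bio, sleeping if necessary.  Never returns
        * NULL, so it must not be used in contexts that cannot sleep.
        */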
      struct bio *
      g_alloc_bio(void)
       {
              struct bio *bp;
      
              bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
      #ifdef KTR
              if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
                      struct stack st;
      
                      CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
                      stack_save(&st);
                      CTRSTACK(KTR_GEOM, &st, 3);
              }
      #endif
              return (bp);
      }
      
      void
      g_destroy_bio(struct bio *bp)
       {
      #ifdef KTR
              if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
                      struct stack st;
      
                      CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
                      stack_save(&st);
                      CTRSTACK(KTR_GEOM, &st, 3);
              }
      #endif
              uma_zfree(biozone, bp);
      }
      
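       /*
        * Create a child bio for passing a request on to the next layer down.
        * The request parameters (command, offset, length, data, pages and
        * attribute) are copied, only the flags relevant to the new request
        * are inherited, and the parent's bio_children count is bumped.
        * Allocates with M_NOWAIT, so the result may be NULL.
        */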
      struct bio *
      g_clone_bio(struct bio *bp)
       {
              struct bio *bp2;
      
              bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
              if (bp2 != NULL) {
                      bp2->bio_parent = bp;
                      bp2->bio_cmd = bp->bio_cmd;
                      /*
                       *  BIO_ORDERED flag may be used by disk drivers to enforce
                       *  ordering restrictions, so this flag needs to be cloned.
                       *  BIO_UNMAPPED and BIO_VLIST should be inherited, to properly
                       *  indicate which way the buffer is passed.
                       *  Other bio flags are not suitable for cloning.
                       */
                      bp2->bio_flags = bp->bio_flags &
                          (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST);
                      bp2->bio_length = bp->bio_length;
                      bp2->bio_offset = bp->bio_offset;
                      bp2->bio_data = bp->bio_data;
                      bp2->bio_ma = bp->bio_ma;
                      bp2->bio_ma_n = bp->bio_ma_n;
                      bp2->bio_ma_offset = bp->bio_ma_offset;
                      bp2->bio_attribute = bp->bio_attribute;
                       if (bp->bio_cmd == BIO_ZONE)
                              bcopy(&bp->bio_zone, &bp2->bio_zone,
                                  sizeof(bp->bio_zone));
      #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
                      bp2->bio_track_bp = bp->bio_track_bp;
      #endif
                      bp->bio_children++;
              }
      #ifdef KTR
              if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
                      struct stack st;
      
                      CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
                      stack_save(&st);
                      CTRSTACK(KTR_GEOM, &st, 3);
              }
      #endif
              return(bp2);
      }
      
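       /*
        * Same idea as g_clone_bio(), but allocates with M_WAITOK and
        * therefore never returns NULL.
        */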
      struct bio *
      g_duplicate_bio(struct bio *bp)
      {
              struct bio *bp2;
      
              bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
              bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST);
              bp2->bio_parent = bp;
              bp2->bio_cmd = bp->bio_cmd;
              bp2->bio_length = bp->bio_length;
              bp2->bio_offset = bp->bio_offset;
              bp2->bio_data = bp->bio_data;
              bp2->bio_ma = bp->bio_ma;
              bp2->bio_ma_n = bp->bio_ma_n;
              bp2->bio_ma_offset = bp->bio_ma_offset;
              bp2->bio_attribute = bp->bio_attribute;
              bp->bio_children++;
      #ifdef KTR
              if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
                      struct stack st;
      
                      CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2);
                      stack_save(&st);
                      CTRSTACK(KTR_GEOM, &st, 3);
              }
      #endif
              return(bp2);
      }
      
      void
      g_reset_bio(struct bio *bp)
      {
      
              bzero(bp, sizeof(*bp));
      }
      
      void
      g_io_init()
      {
      
              g_bioq_init(&g_bio_run_down);
              g_bioq_init(&g_bio_run_up);
              biozone = uma_zcreate("g_bio", sizeof (struct bio),
                  NULL, NULL,
                  NULL, NULL,
                  0, 0);
      }
      
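       /*
        * Synchronously issue a BIO_GETATTR request on the given consumer and
        * wait for it to complete.  On return, *len holds the number of bytes
        * actually delivered into ptr.
        */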
      int
      g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
      {
              struct bio *bp;
              int error;
      
              g_trace(G_T_BIO, "bio_getattr(%s)", attr);
              bp = g_alloc_bio();
              bp->bio_cmd = BIO_GETATTR;
              bp->bio_done = NULL;
              bp->bio_attribute = attr;
              bp->bio_length = *len;
              bp->bio_data = ptr;
              g_io_request(bp, cp);
              error = biowait(bp, "ggetattr");
              *len = bp->bio_completed;
              g_destroy_bio(bp);
              return (error);
      }
      
      int
      g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp)
      {
              struct bio *bp;
              int error;
              
              g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd);
              bp = g_alloc_bio();
              bp->bio_cmd = BIO_ZONE;
              bp->bio_done = NULL;
              /*
               * XXX KDM need to handle report zone data.
               */
              bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args));
              if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES)
                      bp->bio_length =
                          zone_args->zone_params.report.entries_allocated *
                          sizeof(struct disk_zone_rep_entry);
              else
                      bp->bio_length = 0;
      
              g_io_request(bp, cp);
              error = biowait(bp, "gzone");
              bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args));
              g_destroy_bio(bp);
              return (error);
      }
      
      /*
       * Send a BIO_SPEEDUP down the stack. This is used to tell the lower layers that
       * the upper layers have detected a resource shortage. The lower layers are
       * advised to stop delaying I/O that they might be holding for performance
       * reasons and to schedule it (non-trims) or complete it successfully (trims) as
       * quickly as it can. bio_length is the amount of the shortage.  This call
       * should be non-blocking. bio_resid is used to communicate back if the lower
       * layers couldn't find bio_length worth of I/O to schedule or discard. A length
       * of 0 means to do as much as you can (schedule the h/w queues full, discard
       * all trims). flags are a hint from the upper layers to the lower layers what
       * operation should be done.
       */
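       /*
        * A minimal usage sketch (not part of the original code): a caller
        * holding an attached consumer 'cp' opened for writing could ask the
        * layers below to hurry along roughly 1MB worth of queued writes:
        *
        *	size_t resid;
        *	int error;
        *
        *	error = g_io_speedup(1024 * 1024, BIO_SPEEDUP_WRITE, &resid, cp);
        */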
      int
      g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp)
      {
              struct bio *bp;
              int error;
      
              KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0,
                  ("Invalid flags passed to g_io_speedup: %#x", flags));
              g_trace(G_T_BIO, "bio_speedup(%s, %zu, %#x)", cp->provider->name,
                  shortage, flags);
              bp = g_new_bio();
              if (bp == NULL)
                      return (ENOMEM);
              bp->bio_cmd = BIO_SPEEDUP;
              bp->bio_length = shortage;
              bp->bio_done = NULL;
              bp->bio_flags |= flags;
              g_io_request(bp, cp);
              error = biowait(bp, "gflush");
              *resid = bp->bio_resid;
              g_destroy_bio(bp);
              return (error);
      }
      
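       /*
        * Synchronously issue an ordered BIO_FLUSH on the consumer and wait
        * for it to complete.
        */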
      int
      g_io_flush(struct g_consumer *cp)
      {
              struct bio *bp;
              int error;
      
              g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
              bp = g_alloc_bio();
              bp->bio_cmd = BIO_FLUSH;
              bp->bio_flags |= BIO_ORDERED;
              bp->bio_done = NULL;
              bp->bio_attribute = NULL;
              bp->bio_offset = cp->provider->mediasize;
              bp->bio_length = 0;
              bp->bio_data = NULL;
              g_io_request(bp, cp);
              error = biowait(bp, "gflush");
              g_destroy_bio(bp);
              return (error);
      }
      
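       /*
        * Sanity-check a bio before it is handed to the provider.  Returns a
        * non-negative errno (or 0 for a trivially completed request) when the
        * bio should be terminated with g_io_deliver(), and EJUSTRETURN (which
        * is negative) when it should be passed to the provider's start method.
        */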
      static int
      g_io_check(struct bio *bp)
       {
              struct g_consumer *cp;
              struct g_provider *pp;
              off_t excess;
              int error;
      
               biotrack(bp, __func__);
      
              cp = bp->bio_from;
              pp = bp->bio_to;
      
               /* Fail if access counters don't allow the operation */
               switch(bp->bio_cmd) {
              case BIO_READ:
              case BIO_GETATTR:
                       if (cp->acr == 0)
                              return (EPERM);
                      break;
              case BIO_WRITE:
              case BIO_DELETE:
              case BIO_SPEEDUP:
              case BIO_FLUSH:
                       if (cp->acw == 0)
                              return (EPERM);
                      break;
              case BIO_ZONE:
                      if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) ||
                          (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) {
                              if (cp->acr == 0)
                                      return (EPERM);
                      } else if (cp->acw == 0)
                              return (EPERM);
                      break;
              default:
                      return (EPERM);
              }
              /* if provider is marked for error, don't disturb. */
              if (pp->error)
                      return (pp->error);
              if (cp->flags & G_CF_ORPHAN)
                      return (ENXIO);
      
              switch(bp->bio_cmd) {
              case BIO_READ:
              case BIO_WRITE:
              case BIO_DELETE:
                      /* Zero sectorsize or mediasize is probably a lack of media. */
                      if (pp->sectorsize == 0 || pp->mediasize == 0)
                              return (ENXIO);
                      /* Reject I/O not on sector boundary */
                      if (bp->bio_offset % pp->sectorsize)
                              return (EINVAL);
                      /* Reject I/O not integral sector long */
                      if (bp->bio_length % pp->sectorsize)
                              return (EINVAL);
                      /* Reject requests before or past the end of media. */
                      if (bp->bio_offset < 0)
                              return (EIO);
                      if (bp->bio_offset > pp->mediasize)
                              return (EIO);
      
                       /* Truncate requests to the end of the provider's media. */
                       excess = bp->bio_offset + bp->bio_length;
                       if (excess > bp->bio_to->mediasize) {
                              KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
                                  round_page(bp->bio_ma_offset +
                                  bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
                                  ("excess bio %p too short", bp));
                              excess -= bp->bio_to->mediasize;
                              bp->bio_length -= excess;
                              if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
                                      bp->bio_ma_n = round_page(bp->bio_ma_offset +
                                          bp->bio_length) / PAGE_SIZE;
                              }
                              if (excess > 0)
                                      CTR3(KTR_GEOM, "g_down truncated bio "
                                          "%p provider %s by %d", bp,
                                          bp->bio_to->name, excess);
                      }
      
                      /* Deliver zero length transfers right here. */
                       if (bp->bio_length == 0) {
                              CTR2(KTR_GEOM, "g_down terminated 0-length "
                                  "bp %p provider %s", bp, bp->bio_to->name);
                              return (0);
                      }
      
                       if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
                           (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
                          (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
                              if ((error = g_io_transient_map_bio(bp)) >= 0)
                                      return (error);
                      }
                      break;
              default:
                      break;
              }
              return (EJUSTRETURN);
      }
      
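       /*
        * Hand a bio from a consumer to its provider.  Statistics collection
        * is started here, and the request is either dispatched directly in
        * the calling thread (when both ends allow it and enough stack is
        * left) or queued for the g_down thread.
        */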
      void
      g_io_request(struct bio *bp, struct g_consumer *cp)
       {
              struct g_provider *pp;
              int direct, error, first;
              uint8_t cmd;
      
              biotrack(bp, __func__);
      
              KASSERT(cp != NULL, ("NULL cp in g_io_request"));
              KASSERT(bp != NULL, ("NULL bp in g_io_request"));
              pp = cp->provider;
              KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
      #ifdef DIAGNOSTIC
              KASSERT(bp->bio_driver1 == NULL,
                  ("bio_driver1 used by the consumer (geom %s)", cp->geom->name));
              KASSERT(bp->bio_driver2 == NULL,
                  ("bio_driver2 used by the consumer (geom %s)", cp->geom->name));
              KASSERT(bp->bio_pflags == 0,
                  ("bio_pflags used by the consumer (geom %s)", cp->geom->name));
              /*
               * Remember consumer's private fields, so we can detect if they were
               * modified by the provider.
               */
              bp->_bio_caller1 = bp->bio_caller1;
              bp->_bio_caller2 = bp->bio_caller2;
              bp->_bio_cflags = bp->bio_cflags;
      #endif
      
              cmd = bp->bio_cmd;
               if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) {
                      KASSERT(bp->bio_data != NULL,
                          ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd));
              }
               if (cmd == BIO_DELETE || cmd == BIO_FLUSH) {
                      KASSERT(bp->bio_data == NULL,
                          ("non-NULL bp->data in g_io_request(cmd=%hu)",
                          bp->bio_cmd));
              }
              if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) {
                      KASSERT(bp->bio_offset % cp->provider->sectorsize == 0,
                          ("wrong offset %jd for sectorsize %u",
                          bp->bio_offset, cp->provider->sectorsize));
                       KASSERT(bp->bio_length % cp->provider->sectorsize == 0,
                          ("wrong length %jd for sectorsize %u",
                          bp->bio_length, cp->provider->sectorsize));
              }
      
               g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
                  bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
      
              bp->bio_from = cp;
              bp->bio_to = pp;
              bp->bio_error = 0;
              bp->bio_completed = 0;
      
              KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
                  ("Bio already on queue bp=%p", bp));
      
               if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
                  ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
                      binuptime(&bp->bio_t0);
              else
                      getbinuptime(&bp->bio_t0);
               if (g_collectstats & G_STATS_CONSUMERS)
                      devstat_start_transaction(cp->stat, &bp->bio_t0);
              if (g_collectstats & G_STATS_PROVIDERS)
                       devstat_start_transaction(pp->stat, &bp->bio_t0);
      #ifdef INVARIANTS
              atomic_add_int(&cp->nstart, 1);
      #endif
      
      #ifdef GET_STACK_USAGE
              direct = (cp->flags & G_CF_DIRECT_SEND) != 0 &&
                  (pp->flags & G_PF_DIRECT_RECEIVE) != 0 &&
                  !g_is_geom_thread(curthread) &&
                   ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ||
                  (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) &&
                  pace == 0;
              if (direct) {
                       /* Block direct execution if less than half of the stack is left. */
                      size_t        st, su;
                      GET_STACK_USAGE(st, su);
                      if (su * 2 > st)
                              direct = 0;
              }
      #else
              direct = 0;
      #endif
      
              if (direct) {
                      error = g_io_check(bp);
                      if (error >= 0) {
                              CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
                                  "provider %s returned %d", bp, bp->bio_to->name,
                                  error);
                               g_io_deliver(bp, error);
                              return;
                      }
                       bp->bio_to->geom->start(bp);
              } else {
                      g_bioq_lock(&g_bio_run_down);
                      first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
                      TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
                      bp->bio_flags |= BIO_ONQUEUE;
                      g_bio_run_down.bio_queue_length++;
                      g_bioq_unlock(&g_bio_run_down);
                      /* Pass it on down. */
                      if (first)
                              wakeup(&g_wait_down);
              }
      }
      
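       /*
        * Complete a bio on behalf of the provider and hand it back toward the
        * consumer, either by calling biodone() directly or by queueing it for
        * the g_up thread.  An ENOMEM completion is not reported to the
        * consumer; the request is resubmitted and pacing is enabled instead.
        */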
      void
      g_io_deliver(struct bio *bp, int error)
       {
              struct bintime now;
              struct g_consumer *cp;
              struct g_provider *pp;
              struct mtx *mtxp;
              int direct, first;
      
               biotrack(bp, __func__);
      
              KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
              pp = bp->bio_to;
              KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
              cp = bp->bio_from;
              if (cp == NULL) {
                      bp->bio_error = error;
                      bp->bio_done(bp);
                      return;
              }
              KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
              KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
      #ifdef DIAGNOSTIC
              /*
               * Some classes - GJournal in particular - can modify bio's
               * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO
               * flag means it's an expected behaviour for that particular geom.
               */
              if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
                      KASSERT(bp->bio_caller1 == bp->_bio_caller1,
                          ("bio_caller1 used by the provider %s", pp->name));
                      KASSERT(bp->bio_caller2 == bp->_bio_caller2,
                          ("bio_caller2 used by the provider %s", pp->name));
                       KASSERT(bp->bio_cflags == bp->_bio_cflags,
                          ("bio_cflags used by the provider %s", pp->name));
              }
      #endif
              KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
              KASSERT(bp->bio_completed <= bp->bio_length,
                  ("bio_completed can't be greater than bio_length"));
      
               g_trace(G_T_BIO,
      "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
                  bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
                  (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
      
              KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
                  ("Bio already on queue bp=%p", bp));
      
              /*
                * XXX: the next two don't belong here
               */
              bp->bio_bcount = bp->bio_length;
              bp->bio_resid = bp->bio_bcount - bp->bio_completed;
      
      #ifdef GET_STACK_USAGE
              direct = (pp->flags & G_PF_DIRECT_SEND) &&
                       (cp->flags & G_CF_DIRECT_RECEIVE) &&
                       !g_is_geom_thread(curthread);
              if (direct) {
                       /* Block direct execution if less than half of the stack is left. */
                      size_t        st, su;
                      GET_STACK_USAGE(st, su);
                      if (su * 2 > st)
                              direct = 0;
              }
      #else
              direct = 0;
      #endif
      
              /*
                * The statistics collection itself is lockless, but a single
                * instance of the statistics must not be updated from more than
                * one thread at a time, so grab the lock first.
               */
               if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
                  ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
                      binuptime(&now);
              mtxp = mtx_pool_find(mtxpool_sleep, cp);
              mtx_lock(mtxp);
              if (g_collectstats & G_STATS_PROVIDERS)
                       devstat_end_transaction_bio_bt(pp->stat, bp, &now);
               if (g_collectstats & G_STATS_CONSUMERS)
                      devstat_end_transaction_bio_bt(cp->stat, bp, &now);
      #ifdef INVARIANTS
              cp->nend++;
      #endif
              mtx_unlock(mtxp);
      
              if (error != ENOMEM) {
                      bp->bio_error = error;
                      if (direct) {
                               biodone(bp);
                      } else {
                              g_bioq_lock(&g_bio_run_up);
                              first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
                              TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
                              bp->bio_flags |= BIO_ONQUEUE;
                              g_bio_run_up.bio_queue_length++;
                              g_bioq_unlock(&g_bio_run_up);
                              if (first)
                                      wakeup(&g_wait_up);
                      }
                      return;
              }
      
              if (bootverbose)
                      printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
              bp->bio_children = 0;
              bp->bio_inbed = 0;
              bp->bio_driver1 = NULL;
              bp->bio_driver2 = NULL;
              bp->bio_pflags = 0;
              g_io_request(bp, cp);
              pace = 1;
              return;
      }
      
      SYSCTL_DECL(_kern_geom);
      
      static long transient_maps;
      SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
          &transient_maps, 0,
          "Total count of the transient mapping requests");
      u_int transient_map_retries = 10;
      SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
          &transient_map_retries, 0,
          "Max count of retries used before giving up on creating transient map");
      int transient_map_hard_failures;
      SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
          &transient_map_hard_failures, 0,
          "Failures to establish the transient mapping due to retry attempts "
          "exhausted");
      int transient_map_soft_failures;
      SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
          &transient_map_soft_failures, 0,
          "Count of retried failures to establish the transient mapping");
      int inflight_transient_maps;
      SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
          &inflight_transient_maps, 0,
          "Current count of the active transient maps");
      
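       /*
        * Give an unmapped bio a transient kernel virtual mapping so that it
        * can be handed to a provider which does not accept unmapped I/O.
        * Returns EJUSTRETURN when the mapping was established (the request
        * can proceed) and a positive error once the retry budget is
        * exhausted.
        */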
      static int
      g_io_transient_map_bio(struct bio *bp)
      {
              vm_offset_t addr;
              long size;
              u_int retried;
      
              KASSERT(unmapped_buf_allowed, ("unmapped disabled"));
      
              size = round_page(bp->bio_ma_offset + bp->bio_length);
              KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
              addr = 0;
              retried = 0;
              atomic_add_long(&transient_maps, 1);
      retry:
              if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) {
                      if (transient_map_retries != 0 &&
                          retried >= transient_map_retries) {
                              CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
                                  bp, bp->bio_to->name);
                              atomic_add_int(&transient_map_hard_failures, 1);
                              return (EDEADLK/* XXXKIB */);
                      } else {
                              /*
                                * Naive attempt to quiesce the I/O to get more
                               * in-flight requests completed and defragment
                               * the transient_arena.
                               */
                              CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
                                  bp, bp->bio_to->name, retried);
                              pause("g_d_tra", hz / 10);
                              retried++;
                              atomic_add_int(&transient_map_soft_failures, 1);
                              goto retry;
                      }
              }
              atomic_add_int(&inflight_transient_maps, 1);
              pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
              bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
              bp->bio_flags |= BIO_TRANSIENT_MAPPING;
              bp->bio_flags &= ~BIO_UNMAPPED;
              return (EJUSTRETURN);
      }
      
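       /*
        * Main loop of the g_down thread: pull queued requests off the down
        * queue, check them, and pass them to the providers' start methods.
        */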
      void
      g_io_schedule_down(struct thread *tp __unused)
      {
              struct bio *bp;
              int error;
      
              for(;;) {
                      g_bioq_lock(&g_bio_run_down);
                      bp = g_bioq_first(&g_bio_run_down);
                      if (bp == NULL) {
                              CTR0(KTR_GEOM, "g_down going to sleep");
                              msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
                                  PRIBIO | PDROP, "-", 0);
                              continue;
                      }
                      CTR0(KTR_GEOM, "g_down has work to do");
                      g_bioq_unlock(&g_bio_run_down);
                      biotrack(bp, __func__);
                      if (pace != 0) {
                              /*
                               * There has been at least one memory allocation
                               * failure since the last I/O completed. Pause 1ms to
                               * give the system a chance to free up memory. We only
                               * do this once because a large number of allocations
                               * can fail in the direct dispatch case and there's no
                               * relationship between the number of these failures and
                               * the length of the outage. If there's still an outage,
                               * we'll pause again and again until it's
                               * resolved. Older versions paused longer and once per
                               * allocation failure. This was OK for a single threaded
                               * g_down, but with direct dispatch would lead to max of
                               * 10 IOPs for minutes at a time when transient memory
                               * issues prevented allocation for a batch of requests
                               * from the upper layers.
                               *
                               * XXX This pacing is really lame. It needs to be solved
                               * by other methods. This is OK only because the worst
                               * case scenario is so rare. In the worst case scenario
                               * all memory is tied up waiting for I/O to complete
                               * which can never happen since we can't allocate bios
                               * for that I/O.
                               */
                              CTR0(KTR_GEOM, "g_down pacing self");
                              pause("g_down", min(hz/1000, 1));
                              pace = 0;
                      }
                      CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
                          bp->bio_to->name);
                      error = g_io_check(bp);
                      if (error >= 0) {
                              CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider "
                                  "%s returned %d", bp, bp->bio_to->name, error);
                              g_io_deliver(bp, error);
                              continue;
                      }
                      THREAD_NO_SLEEPING();
                      CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
                          "len %ld", bp, bp->bio_to->name, bp->bio_offset,
                          bp->bio_length);
                      bp->bio_to->geom->start(bp);
                      THREAD_SLEEPING_OK();
              }
      }
      
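       /*
        * Main loop of the g_up thread: pull completed requests off the up
        * queue and finish them with biodone().
        */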
      void
      g_io_schedule_up(struct thread *tp __unused)
      {
              struct bio *bp;
      
              for(;;) {
                      g_bioq_lock(&g_bio_run_up);
                      bp = g_bioq_first(&g_bio_run_up);
                      if (bp == NULL) {
                              CTR0(KTR_GEOM, "g_up going to sleep");
                              msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
                                  PRIBIO | PDROP, "-", 0);
                              continue;
                      }
                      g_bioq_unlock(&g_bio_run_up);
                      THREAD_NO_SLEEPING();
                      CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off "
                          "%jd len %ld", bp, bp->bio_to->name,
                          bp->bio_offset, bp->bio_length);
                      biodone(bp);
                      THREAD_SLEEPING_OK();
              }
      }
      
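       /*
        * Synchronously read 'length' bytes at 'offset' from the consumer into
        * a freshly allocated buffer.  Returns the buffer on success (the
        * caller must g_free() it) or NULL on error; the error code is also
        * stored in *error when that pointer is non-NULL.
        */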
      void *
      g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
      {
              struct bio *bp;
              void *ptr;
              int errorc;
      
              KASSERT(length > 0 && length >= cp->provider->sectorsize &&
                  length <= MAXPHYS, ("g_read_data(): invalid length %jd",
                  (intmax_t)length));
      
              bp = g_alloc_bio();
              bp->bio_cmd = BIO_READ;
              bp->bio_done = NULL;
              bp->bio_offset = offset;
              bp->bio_length = length;
              ptr = g_malloc(length, M_WAITOK);
              bp->bio_data = ptr;
              g_io_request(bp, cp);
              errorc = biowait(bp, "gread");
              if (error != NULL)
                      *error = errorc;
              g_destroy_bio(bp);
              if (errorc) {
                      g_free(ptr);
                      ptr = NULL;
              }
              return (ptr);
      }
      
      /*
       * A read function for use by ffs_sbget when used by GEOM-layer routines.
       */
      int
      g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size)
      {
              struct g_consumer *cp;
      
              KASSERT(*bufp == NULL,
                  ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp));
      
              cp = (struct g_consumer *)devfd;
              /*
               * Take care not to issue an invalid I/O request. The offset of
                * the superblock candidate must be a multiple of the provider's
               * sector size, otherwise an FFS can't exist on the provider
               * anyway.
               */
              if (loc % cp->provider->sectorsize != 0)
                      return (ENOENT);
              *bufp = g_read_data(cp, loc, size, NULL);
              if (*bufp == NULL)
                      return (ENOENT);
              return (0);
      }
      
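       /*
        * Synchronously write 'length' bytes from ptr at 'offset' on the
        * consumer and return the completion status.
        */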
      int
      g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
      {
              struct bio *bp;
              int error;
      
              KASSERT(length > 0 && length >= cp->provider->sectorsize &&
                  length <= MAXPHYS, ("g_write_data(): invalid length %jd",
                  (intmax_t)length));
      
              bp = g_alloc_bio();
              bp->bio_cmd = BIO_WRITE;
              bp->bio_done = NULL;
              bp->bio_offset = offset;
              bp->bio_length = length;
              bp->bio_data = ptr;
              g_io_request(bp, cp);
              error = biowait(bp, "gwrite");
              g_destroy_bio(bp);
              return (error);
      }
      
      /*
       * A write function for use by ffs_sbput when used by GEOM-layer routines.
       */
      int
      g_use_g_write_data(void *devfd, off_t loc, void *buf, int size)
      {
      
              return (g_write_data((struct g_consumer *)devfd, loc, buf, size));
      }
      
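       /*
        * Synchronously issue a BIO_DELETE (trim/unmap) for the given range
        * and return the completion status.
        */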
      int
      g_delete_data(struct g_consumer *cp, off_t offset, off_t length)
      {
              struct bio *bp;
              int error;
      
              KASSERT(length > 0 && length >= cp->provider->sectorsize,
                  ("g_delete_data(): invalid length %jd", (intmax_t)length));
      
              bp = g_alloc_bio();
              bp->bio_cmd = BIO_DELETE;
              bp->bio_done = NULL;
              bp->bio_offset = offset;
              bp->bio_length = length;
              bp->bio_data = NULL;
              g_io_request(bp, cp);
              error = biowait(bp, "gdelete");
              g_destroy_bio(bp);
              return (error);
      }
      
      void
      g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix,
          ...)
      {
      #ifndef PRINTF_BUFR_SIZE
      #define PRINTF_BUFR_SIZE 64
      #endif
              char bufr[PRINTF_BUFR_SIZE];
              struct sbuf sb, *sbp __unused;
              va_list ap;
      
              sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN);
              KASSERT(sbp != NULL, ("sbuf_new misused?"));
      
              sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
      
              sbuf_cat(&sb, prefix);
              g_format_bio(&sb, bp);
      
              va_start(ap, fmtsuffix);
              sbuf_vprintf(&sb, fmtsuffix, ap);
              va_end(ap);
      
              sbuf_nl_terminate(&sb);
      
              sbuf_finish(&sb);
              sbuf_delete(&sb);
      }
      
      void
      g_format_bio(struct sbuf *sb, const struct bio *bp)
      {
              const char *pname, *cmd = NULL;
      
              if (bp->bio_to != NULL)
                      pname = bp->bio_to->name;
              else
                      pname = "[unknown]";
      
              switch (bp->bio_cmd) {
              case BIO_GETATTR:
                      cmd = "GETATTR";
                      sbuf_printf(sb, "%s[%s(attr=%s)]", pname, cmd,
                          bp->bio_attribute);
                      return;
              case BIO_FLUSH:
                      cmd = "FLUSH";
                      sbuf_printf(sb, "%s[%s]", pname, cmd);
                      return;
              case BIO_ZONE: {
                      char *subcmd = NULL;
                      cmd = "ZONE";
                      switch (bp->bio_zone.zone_cmd) {
                      case DISK_ZONE_OPEN:
                              subcmd = "OPEN";
                              break;
                      case DISK_ZONE_CLOSE:
                              subcmd = "CLOSE";
                              break;
                      case DISK_ZONE_FINISH:
                              subcmd = "FINISH";
                              break;
                      case DISK_ZONE_RWP:
                              subcmd = "RWP";
                              break;
                      case DISK_ZONE_REPORT_ZONES:
                              subcmd = "REPORT ZONES";
                              break;
                      case DISK_ZONE_GET_PARAMS:
                              subcmd = "GET PARAMS";
                              break;
                      default:
                              subcmd = "UNKNOWN";
                              break;
                      }
                      sbuf_printf(sb, "%s[%s,%s]", pname, cmd, subcmd);
                      return;
              }
              case BIO_READ:
                      cmd = "READ";
                      break;
              case BIO_WRITE:
                      cmd = "WRITE";
                      break;
              case BIO_DELETE:
                      cmd = "DELETE";
                      break;
              default:
                      cmd = "UNKNOWN";
                      sbuf_printf(sb, "%s[%s()]", pname, cmd);
                      return;
              }
              sbuf_printf(sb, "%s[%s(offset=%jd, length=%jd)]", pname, cmd,
                  (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
      }
      /*-
       * SPDX-License-Identifier: BSD-3-Clause
       *
       * Copyright (c) 1982, 1986, 1989, 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * This code is derived from software contributed to Berkeley by
       * Mike Karels at Berkeley Software Design, Inc.
       *
       * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
        * project, to make these variables more user-friendly.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)kern_sysctl.c        8.4 (Berkeley) 4/14/94
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_posix.h"
      #include "opt_config.h"
      
      #include <sys/param.h>
      #include <sys/boot.h>
      #include <sys/jail.h>
      #include <sys/kernel.h>
      #include <sys/limits.h>
      #include <sys/lock.h>
      #include <sys/mutex.h>
      #include <sys/proc.h>
      #include <sys/random.h>
      #include <sys/sbuf.h>
      #include <sys/smp.h>
      #include <sys/sx.h>
      #include <sys/vmmeter.h>
      #include <sys/sysctl.h>
      #include <sys/systm.h>
      #include <sys/unistd.h>
      
      SYSCTL_ROOT_NODE(0, sysctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Sysctl internal magic");
      SYSCTL_ROOT_NODE(CTL_KERN, kern, CTLFLAG_RW | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, 0,
          "High kernel, proc, limits &c");
      SYSCTL_ROOT_NODE(CTL_VM, vm, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Virtual memory");
      SYSCTL_ROOT_NODE(CTL_VFS, vfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "File system");
      SYSCTL_ROOT_NODE(CTL_NET, net, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Network, (see socket.h)");
      SYSCTL_ROOT_NODE(CTL_DEBUG, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Debugging");
      SYSCTL_NODE(_debug, OID_AUTO,  sizeof,  CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Sizeof various things");
      SYSCTL_ROOT_NODE(CTL_HW, hw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "hardware");
      SYSCTL_ROOT_NODE(CTL_MACHDEP, machdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "machine dependent");
      SYSCTL_NODE(_machdep, OID_AUTO, mitigations, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Machine dependent platform mitigations.");
      SYSCTL_ROOT_NODE(CTL_USER, user, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "user-level");
      SYSCTL_ROOT_NODE(CTL_P1003_1B, p1003_1b, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "p1003_1b, (see p1003_1b.h)");
      
      SYSCTL_ROOT_NODE(OID_AUTO, compat, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Compatibility code");
      SYSCTL_ROOT_NODE(OID_AUTO, security, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 
          "Security");
      #ifdef REGRESSION
      SYSCTL_ROOT_NODE(OID_AUTO, regression, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Regression test MIB");
      #endif
      
      SYSCTL_STRING(_kern, OID_AUTO, ident, CTLFLAG_RD|CTLFLAG_MPSAFE,
          kern_ident, 0, "Kernel identifier");
      
      SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD|CTLFLAG_CAPRD,
          SYSCTL_NULL_INT_PTR, BSD, "Operating system revision");
      
      SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD|CTLFLAG_MPSAFE,
          version, 0, "Kernel version");
      
      SYSCTL_STRING(_kern, OID_AUTO, compiler_version, CTLFLAG_RD|CTLFLAG_MPSAFE,
          compiler_version, 0, "Version of compiler used to compile kernel");
      
      SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD|CTLFLAG_MPSAFE|
          CTLFLAG_CAPRD, ostype, 0, "Operating system type");
      
      SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
          &maxproc, 0, "Maximum number of processes");
      
      SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW,
          &maxprocperuid, 0, "Maximum processes allowed per userid");
      
      SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
          &maxusers, 0, "Hint for kernel tuning");
      
      SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD|CTLFLAG_CAPRD,
          SYSCTL_NULL_INT_PTR, ARG_MAX, "Maximum bytes of argument to execve(2)");
      
      SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD|CTLFLAG_CAPRD,
          SYSCTL_NULL_INT_PTR, _POSIX_VERSION, "Version of POSIX attempting to comply to");
      
      SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RDTUN |
          CTLFLAG_NOFETCH | CTLFLAG_CAPRD, &ngroups_max, 0,
          "Maximum number of supplemental groups a user can belong to");
      
      SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD|CTLFLAG_CAPRD,
          SYSCTL_NULL_INT_PTR, 1, "Whether job control is available");
      
      #ifdef _POSIX_SAVED_IDS
      SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD,
          SYSCTL_NULL_INT_PTR, 1, "Whether saved set-group/user ID is available");
      #else
      SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD,
          SYSCTL_NULL_INT_PTR, 0, "Whether saved set-group/user ID is available");
      #endif
      
      char kernelname[MAXPATHLEN] = PATH_KERNEL;        /* XXX bloat */
      
      SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW | CTLFLAG_MPSAFE,
          kernelname, sizeof kernelname, "Name of kernel file booted");
      
      SYSCTL_INT(_kern, KERN_MAXPHYS, maxphys, CTLFLAG_RD | CTLFLAG_CAPRD,
          SYSCTL_NULL_INT_PTR, MAXPHYS, "Maximum block I/O access size");
      
      SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD|CTLFLAG_CAPRD,
          &mp_ncpus, 0, "Number of active CPUs");
      
      SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD|CTLFLAG_CAPRD,
          SYSCTL_NULL_INT_PTR, BYTE_ORDER, "System byte order");
      
      SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD|CTLFLAG_CAPRD,
          SYSCTL_NULL_INT_PTR, PAGE_SIZE, "System memory page size");
      
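       /*
        * Return up to 256 bytes of random data for the kern.arandom sysctl.
        */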
      static int
      sysctl_kern_arnd(SYSCTL_HANDLER_ARGS)
      {
              char buf[256];
              size_t len;
      
              len = MIN(req->oldlen, sizeof(buf));
              read_random(buf, len);
              return (SYSCTL_OUT(req, buf, len));
      }
      
      SYSCTL_PROC(_kern, KERN_ARND, arandom,
          CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, NULL, 0,
          sysctl_kern_arnd, "", "arc4rand");
      
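       /*
        * Report the amount of physical memory in bytes.  The page count is
        * clamped to SIZE_T_MAX >> PAGE_SHIFT so that the ctob() conversion
        * cannot overflow; the realmem and usermem handlers below follow the
        * same pattern.
        */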
      static int
      sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
      {
              u_long val, p;
      
              p = SIZE_T_MAX >> PAGE_SHIFT;
              if (physmem < p)
                      p = physmem;
              val = ctob(p);
              return (sysctl_handle_long(oidp, &val, 0, req));
      }
      SYSCTL_PROC(_hw, HW_PHYSMEM, physmem,
          CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
          sysctl_hw_physmem, "LU",
          "Amount of physical memory (in bytes)");
      
      static int
      sysctl_hw_realmem(SYSCTL_HANDLER_ARGS)
      {
              u_long val, p;
      
              p = SIZE_T_MAX >> PAGE_SHIFT;
              if (realmem < p)
                      p = realmem;
              val = ctob(p);
              return (sysctl_handle_long(oidp, &val, 0, req));
      }
      SYSCTL_PROC(_hw, HW_REALMEM, realmem,
          CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
          sysctl_hw_realmem, "LU",
          "Amount of memory (in bytes) reported by the firmware");
      
      static int
      sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
      {
              u_long val, p, p1;
      
              p1 = physmem - vm_wire_count();
              p = SIZE_T_MAX >> PAGE_SHIFT;
              if (p1 < p)
                      p = p1;
              val = ctob(p);
              return (sysctl_handle_long(oidp, &val, 0, req));
      }
      SYSCTL_PROC(_hw, HW_USERMEM, usermem,
          CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
          sysctl_hw_usermem, "LU",
          "Amount of memory (in bytes) which is not wired");
      
      SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0,
          "Amount of physical memory (in pages)");
      
      u_long pagesizes[MAXPAGESIZES] = { PAGE_SIZE };
      
      static int
      sysctl_hw_pagesizes(SYSCTL_HANDLER_ARGS)
      {
              int error;
      #ifdef SCTL_MASK32
              int i;
              uint32_t pagesizes32[MAXPAGESIZES];
      
              if (req->flags & SCTL_MASK32) {
                      /*
                       * Recreate the "pagesizes" array with 32-bit elements.  Truncate
                       * any page size greater than UINT32_MAX to zero.
                       */
                      for (i = 0; i < MAXPAGESIZES; i++)
                              pagesizes32[i] = (uint32_t)pagesizes[i];
      
                      error = SYSCTL_OUT(req, pagesizes32, sizeof(pagesizes32));
              } else
      #endif
                      error = SYSCTL_OUT(req, pagesizes, sizeof(pagesizes));
              return (error);
      }
      SYSCTL_PROC(_hw, OID_AUTO, pagesizes,
          CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 
          sysctl_hw_pagesizes, "LU",
          "Supported page sizes");
      
      #ifdef SCTL_MASK32
      int adaptive_machine_arch = 1;
      SYSCTL_INT(_debug, OID_AUTO, adaptive_machine_arch, CTLFLAG_RW,
          &adaptive_machine_arch, 1,
          "Adapt reported machine architecture to the ABI of the binary");
      #endif
      
      static int
      sysctl_hw_machine_arch(SYSCTL_HANDLER_ARGS)
      {
              int error;
              static const char machine_arch[] = MACHINE_ARCH;
      #ifdef SCTL_MASK32
              static const char machine_arch32[] = MACHINE_ARCH32;
      
              if ((req->flags & SCTL_MASK32) != 0 && adaptive_machine_arch)
                      error = SYSCTL_OUT(req, machine_arch32, sizeof(machine_arch32));
              else
      #endif
                      error = SYSCTL_OUT(req, machine_arch, sizeof(machine_arch));
              return (error);
      
      }
      SYSCTL_PROC(_hw, HW_MACHINE_ARCH, machine_arch, CTLTYPE_STRING | CTLFLAG_RD |
          CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_machine_arch, "A",
          "System architecture");
      
      SYSCTL_STRING(_kern, OID_AUTO, supported_archs, CTLFLAG_RD | CTLFLAG_MPSAFE,
      #ifdef COMPAT_FREEBSD32
          MACHINE_ARCH " " MACHINE_ARCH32, 0, "Supported architectures for binaries");
      #else
          MACHINE_ARCH, 0, "Supported architectures for binaries");
      #endif
      
      static int
      sysctl_hostname(SYSCTL_HANDLER_ARGS)
      {
              struct prison *pr, *cpr;
              size_t pr_offset;
              char tmpname[MAXHOSTNAMELEN];
              int descend, error, len;
      
              /*
               * This function can set: hostname domainname hostuuid.
               * Keep that in mind when comments say "hostname".
               */
              pr_offset = (size_t)arg1;
              len = arg2;
              KASSERT(len <= sizeof(tmpname),
                  ("length %d too long for %s", len, __func__));
      
              pr = req->td->td_ucred->cr_prison;
              if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME) && req->newptr)
                      return (EPERM);
              /*
               * Make a local copy of hostname to get/set so we don't have to hold
               * the jail mutex during the sysctl copyin/copyout activities.
               */
              mtx_lock(&pr->pr_mtx);
              bcopy((char *)pr + pr_offset, tmpname, len);
              mtx_unlock(&pr->pr_mtx);
      
              error = sysctl_handle_string(oidp, tmpname, len, req);
      
              if (req->newptr != NULL && error == 0) {
                      /*
                       * Copy the locally set hostname to all jails that share
                       * this host info.
                       */
                      sx_slock(&allprison_lock);
                      while (!(pr->pr_flags & PR_HOST))
                              pr = pr->pr_parent;
                      mtx_lock(&pr->pr_mtx);
                bcopy(tmpname, (char *)pr + pr_offset, len);
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
                              if (cpr->pr_flags & PR_HOST)
                                      descend = 0;
                              else
                                      bcopy(tmpname, (char *)cpr + pr_offset, len);
                      mtx_unlock(&pr->pr_mtx);
                      sx_sunlock(&allprison_lock);
              }
              return (error);
      }
      
      SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname,
          CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE,
          (void *)(offsetof(struct prison, pr_hostname)), MAXHOSTNAMELEN,
          sysctl_hostname, "A", "Hostname");
      SYSCTL_PROC(_kern, KERN_NISDOMAINNAME, domainname,
          CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE,
          (void *)(offsetof(struct prison, pr_domainname)), MAXHOSTNAMELEN,
          sysctl_hostname, "A", "Name of the current YP/NIS domain");
      SYSCTL_PROC(_kern, KERN_HOSTUUID, hostuuid,
          CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE,
          (void *)(offsetof(struct prison, pr_hostuuid)), HOSTUUIDLEN,
          sysctl_hostname, "A", "Host UUID");
      
      static int        regression_securelevel_nonmonotonic = 0;
      
      #ifdef REGRESSION
      SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW,
          &regression_securelevel_nonmonotonic, 0, "securelevel may be lowered");
      #endif
      
      static int
      sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS)
      {
              struct prison *pr, *cpr;
              int descend, error, level;
      
              pr = req->td->td_ucred->cr_prison;
      
              /*
               * Reading the securelevel is easy, since the current jail's level
               * is known to be at least as secure as any higher levels.  Perform
               * a lockless read since the securelevel is an integer.
               */
              level = pr->pr_securelevel;
              error = sysctl_handle_int(oidp, &level, 0, req);
              if (error || !req->newptr)
                      return (error);
        /* Permit the update only if the new securelevel is not lower than the old one. */
              sx_slock(&allprison_lock);
              mtx_lock(&pr->pr_mtx);
              if (!regression_securelevel_nonmonotonic &&
                  level < pr->pr_securelevel) {
                      mtx_unlock(&pr->pr_mtx);
                      sx_sunlock(&allprison_lock);
                      return (EPERM);
              }
              pr->pr_securelevel = level;
              /*
               * Set all child jails to be at least this level, but do not lower
               * them (even if regression_securelevel_nonmonotonic).
               */
              FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) {
                      if (cpr->pr_securelevel < level)
                              cpr->pr_securelevel = level;
              }
              mtx_unlock(&pr->pr_mtx);
              sx_sunlock(&allprison_lock);
              return (error);
      }
      
      SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel,
          CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, 0, 0,
          sysctl_kern_securelvl, "I",
          "Current secure level");
      
      #ifdef INCLUDE_CONFIG_FILE
      /* Actual kernel configuration options. */
      extern char kernconfstring[];
      
      SYSCTL_STRING(_kern, OID_AUTO, conftxt, CTLFLAG_RD | CTLFLAG_MPSAFE,
          kernconfstring, 0, "Kernel configuration file");
      #endif
      
      static int
      sysctl_hostid(SYSCTL_HANDLER_ARGS)
      {
              struct prison *pr, *cpr;
              u_long tmpid;
              int descend, error;
      
              /*
               * Like sysctl_hostname, except it operates on a u_long
               * instead of a string, and is used only for hostid.
               */
              pr = req->td->td_ucred->cr_prison;
              if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME) && req->newptr)
                      return (EPERM);
              tmpid = pr->pr_hostid;
              error = sysctl_handle_long(oidp, &tmpid, 0, req);
      
              if (req->newptr != NULL && error == 0) {
                      sx_slock(&allprison_lock);
                while (!(pr->pr_flags & PR_HOST))
                              pr = pr->pr_parent;
                      mtx_lock(&pr->pr_mtx);
                      pr->pr_hostid = tmpid;
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
                              if (cpr->pr_flags & PR_HOST)
                                      descend = 0;
                              else
                                      cpr->pr_hostid = tmpid;
                      mtx_unlock(&pr->pr_mtx);
                      sx_sunlock(&allprison_lock);
              }
              return (error);
      }
      
      SYSCTL_PROC(_kern, KERN_HOSTID, hostid,
          CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE | CTLFLAG_CAPRD,
          NULL, 0, sysctl_hostid, "LU", "Host ID");
      
      static struct mtx bootid_lk;
      MTX_SYSINIT(bootid_lock, &bootid_lk, "bootid generator lock", MTX_DEF);
      
      static int
      sysctl_bootid(SYSCTL_HANDLER_ARGS)
      {
              static uint8_t boot_id[16];
              static bool initialized = false;
      
              mtx_lock(&bootid_lk);
              if (!initialized) {
                      if (!is_random_seeded()) {
                              mtx_unlock(&bootid_lk);
                              return (ENXIO);
                      }
                      arc4random_buf(boot_id, sizeof(boot_id));
                      initialized = true;
              }
              mtx_unlock(&bootid_lk);
      
              return (SYSCTL_OUT(req, boot_id, sizeof(boot_id)));
      }
      SYSCTL_PROC(_kern, OID_AUTO, boot_id,
          CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD,
          NULL, 0, sysctl_bootid, "", "Random boot ID");
      
      /*
       * The osrelease string is copied from the global (osrelease in vers.c) into
       * prison0 by a sysinit and is inherited by child jails if not changed at jail
       * creation, so we always return the copy from the current prison data.
       */
      static int
      sysctl_osrelease(SYSCTL_HANDLER_ARGS)
      {
              struct prison *pr;
      
              pr = req->td->td_ucred->cr_prison;
        return (SYSCTL_OUT(req, pr->pr_osrelease, strlen(pr->pr_osrelease) + 1));
}
      
      SYSCTL_PROC(_kern, KERN_OSRELEASE, osrelease,
          CTLTYPE_STRING | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE,
          NULL, 0, sysctl_osrelease, "A", "Operating system release");
      
      /*
       * The osreldate number is copied from the global (osreldate in vers.c) into
       * prison0 by a sysinit and is inherited by child jails if not changed at jail
       * creation, so we always return the value from the current prison data.
       */
      static int
      sysctl_osreldate(SYSCTL_HANDLER_ARGS)
      {
              struct prison *pr;
      
              pr = req->td->td_ucred->cr_prison;
        return (SYSCTL_OUT(req, &pr->pr_osreldate, sizeof(pr->pr_osreldate)));
}
      
      /*
       * NOTICE: The *userland* release date is available in
       * /usr/include/osreldate.h
       */
      SYSCTL_PROC(_kern, KERN_OSRELDATE, osreldate,
          CTLTYPE_INT | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE,
          NULL, 0, sysctl_osreldate, "I", "Kernel release date");
      
      /*
       * The build-id is copied from the ELF section .note.gnu.build-id.  The linker
       * script defines two variables to expose the beginning and end.  LLVM
       * currently uses a SHA-1 hash, but other formats can be supported by checking
       * the length of the section.
       */
      
      extern char __build_id_start[];
      extern char __build_id_end[];
      
      #define        BUILD_ID_HEADER_LEN        0x10
      #define        BUILD_ID_HASH_MAXLEN        0x14
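
/*
 * For reference, BUILD_ID_HEADER_LEN above corresponds to the fixed part of
 * the ELF note: three 4-byte words (name size, descriptor size, type)
 * followed by the 4-byte "GNU\0" name.  A hedged, illustrative layout sketch
 * (not compiled; sysctl_build_id() below reads the raw bytes directly):
 */
#if 0
struct build_id_note {
        uint32_t        n_namesz;       /* 4, for "GNU\0" */
        uint32_t        n_descsz;       /* hash length, e.g. 0x14 for SHA-1 */
        uint32_t        n_type;         /* NT_GNU_BUILD_ID */
        char            n_name[4];      /* "GNU\0" */
        /* n_descsz hash bytes follow, four-byte aligned */
};
#endif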
      
      static int
      sysctl_build_id(SYSCTL_HANDLER_ARGS)
      {
              uintptr_t sectionlen = (uintptr_t)(__build_id_end - __build_id_start);
              int hashlen;
              char buf[2*BUILD_ID_HASH_MAXLEN+1];
      
              /*
               * The ELF note section has a four byte length for the vendor name,
               * four byte length for the value, and a four byte vendor specific
               * type.  The name for the build id is "GNU\0".  We skip the first 16
               * bytes to read the build hash.  We will return the remaining bytes up
               * to 20 (SHA-1) hash size.  If the hash happens to be a custom number
               * of bytes we will pad the value with zeros, as the section should be
               * four byte aligned.
               */
              if (sectionlen <= BUILD_ID_HEADER_LEN ||
                  sectionlen > (BUILD_ID_HEADER_LEN + BUILD_ID_HASH_MAXLEN)) {
                      return (ENOENT);
              }
      
              hashlen = sectionlen - BUILD_ID_HEADER_LEN;
              for (int i = 0; i < hashlen; i++) {
                      uint8_t c = __build_id_start[i+BUILD_ID_HEADER_LEN];
                      snprintf(&buf[2*i], 3, "%02x", c);
              }
      
              return (SYSCTL_OUT(req, buf, strlen(buf) + 1));
      }
      
      SYSCTL_PROC(_kern, OID_AUTO, build_id,
          CTLTYPE_STRING | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE,
          NULL, 0, sysctl_build_id, "A", "Operating system build-id");
      
      SYSCTL_NODE(_kern, OID_AUTO, features, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
          "Kernel Features");
      
      #ifdef COMPAT_FREEBSD4
      FEATURE(compat_freebsd4, "Compatible with FreeBSD 4");
      #endif
      
      #ifdef COMPAT_FREEBSD5
      FEATURE(compat_freebsd5, "Compatible with FreeBSD 5");
      #endif
      
      #ifdef COMPAT_FREEBSD6
      FEATURE(compat_freebsd6, "Compatible with FreeBSD 6");
      #endif
      
      #ifdef COMPAT_FREEBSD7
      FEATURE(compat_freebsd7, "Compatible with FreeBSD 7");
      #endif
      
      /*
 * This is really cheating.  These actually live in libc, which I'm not
 * quite sure is a good idea anyway, but in order for getnext and friends
 * to actually work, we define dummies here.
       *
       * XXXRW: These probably should be CTLFLAG_CAPRD.
       */
      SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD,
          "", 0, "PATH that finds all the standard utilities");
      SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Max ibase/obase values in bc(1)");
      SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Max array size in bc(1)");
      SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Max scale value in bc(1)");
      SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Max string length in bc(1)");
      SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry");
      SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "");
      SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Max length (bytes) of a text-processing utility's input line");
      SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Maximum number of repeats of a regexp permitted");
      SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0,
          "The version of POSIX 1003.2 with which the system attempts to comply");
      SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Whether C development supports the C bindings option");
      SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Whether system supports the C development utilities option");
      SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "");
      SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Whether system supports FORTRAN development utilities");
      SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Whether system supports FORTRAN runtime utilities");
      SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Whether system supports creation of locales");
      SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Whether system supports software development utilities");
      SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, 0, "Whether system supports the user portability utilities");
SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD,
    SYSCTL_NULL_INT_PTR, 0, "Minimum maximum number of streams a process may have open at one time");
SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD,
    SYSCTL_NULL_INT_PTR, 0, "Minimum maximum number of bytes supported for timezone names");
      
      #include <sys/vnode.h>
      SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, sizeof(struct vnode), "sizeof(struct vnode)");
      
      SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, sizeof(struct proc), "sizeof(struct proc)");
      
      static int
      sysctl_kern_pid_max(SYSCTL_HANDLER_ARGS)
      {
              int error, pm;
      
              pm = pid_max;
              error = sysctl_handle_int(oidp, &pm, 0, req);
              if (error || !req->newptr)
                      return (error);
              sx_xlock(&proctree_lock);
              sx_xlock(&allproc_lock);
      
              /*
         * Only permit values no greater than PID_MAX.
         * As a safety measure, do not allow pid_max to be lowered too far.
               */
              if (pm < 300 || pm > PID_MAX)
                      error = EINVAL;
              else
                      pid_max = pm;
              sx_xunlock(&allproc_lock);
              sx_xunlock(&proctree_lock);
              return (error);
      }
      SYSCTL_PROC(_kern, OID_AUTO, pid_max, CTLTYPE_INT |
          CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
          0, 0, sysctl_kern_pid_max, "I", "Maximum allowed pid");
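
/*
 * Example (illustrative): because the OID above is CTLFLAG_RWTUN, the limit
 * may be set either at runtime ("sysctl kern.pid_max=10000") or as a loader
 * tunable (kern.pid_max="10000" in loader.conf), subject to the 300..PID_MAX
 * bounds enforced by the handler.
 */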
      
      #include <sys/bio.h>
      #include <sys/buf.h>
      SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, sizeof(struct bio), "sizeof(struct bio)");
      SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, sizeof(struct buf), "sizeof(struct buf)");
      
      #include <sys/user.h>
      SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)");
      
      /* Used by kernel debuggers. */
      const int pcb_size = sizeof(struct pcb);
      SYSCTL_INT(_debug_sizeof, OID_AUTO, pcb, CTLFLAG_RD,
          SYSCTL_NULL_INT_PTR, sizeof(struct pcb), "sizeof(struct pcb)");
      
      /* XXX compatibility, remove for 6.0 */
      #include <sys/imgact.h>
      #include <sys/imgact_elf.h>
      SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW,
          &__elfN(fallback_brand), sizeof(__elfN(fallback_brand)),
          "compatibility for kern.fallback_elf_brand");
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2004 Poul-Henning Kamp
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/bio.h>
      #include <sys/kernel.h>
      #include <sys/lock.h>
      #include <sys/malloc.h>
      #include <sys/mutex.h>
      #include <sys/sbuf.h>
      #include <sys/vnode.h>
      #include <sys/mount.h>
      
      #include <geom/geom.h>
      #include <geom/geom_vfs.h>
      
      /*
 * Subroutines for use by filesystems.
 *
 * XXX: should maybe live somewhere else?
       */
      #include <sys/buf.h>
      
      struct g_vfs_softc {
              struct mtx         sc_mtx;
              struct bufobj        *sc_bo;
              int                 sc_active;
              int                 sc_orphaned;
      };
      
      static struct buf_ops __g_vfs_bufops = {
              .bop_name =        "GEOM_VFS",
              .bop_write =        bufwrite,
              .bop_strategy =        g_vfs_strategy,        
              .bop_sync =        bufsync,        
              .bop_bdflush =        bufbdflush
      };
      
      struct buf_ops *g_vfs_bufops = &__g_vfs_bufops;
      
      static g_orphan_t g_vfs_orphan;
      
      static struct g_class g_vfs_class = {
              .name =                "VFS",
              .version =        G_VERSION,
              .orphan =        g_vfs_orphan,
      };
      
      DECLARE_GEOM_CLASS(g_vfs_class, g_vfs);
      
      static void
      g_vfs_destroy(void *arg, int flags __unused)
      {
              struct g_consumer *cp;
      
              g_topology_assert();
              cp = arg;
              if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
                      g_access(cp, -cp->acr, -cp->acw, -cp->ace);
              g_detach(cp);
              if (cp->geom->softc == NULL)
                      g_wither_geom(cp->geom, ENXIO);
      }
      
      static void
      g_vfs_done(struct bio *bip)
{
              struct g_consumer *cp;
              struct g_vfs_softc *sc;
              struct buf *bp;
              int destroy;
              struct mount *mp;
              struct vnode *vp;
              struct cdev *cdevp;
      
              /*
               * Collect statistics on synchronous and asynchronous read
               * and write counts for disks that have associated filesystems.
               */
              bp = bip->bio_caller2;
              vp = bp->b_vp;
              if (vp != NULL) {
                      /*
                 * If this is not a disk vnode, use its associated mount point;
                 * otherwise use the mount point associated with the disk.
                       */
                      VI_LOCK(vp);
                if (vp->v_type != VCHR ||
                          (cdevp = vp->v_rdev) == NULL ||
                          cdevp->si_devsw == NULL ||
                          (cdevp->si_devsw->d_flags & D_DISK) == 0)
                              mp = vp->v_mount;
                      else
                              mp = cdevp->si_mountpt;
                      if (mp != NULL) {
                              if (bp->b_iocmd == BIO_READ) {
                                      if (LK_HOLDER(bp->b_lock.lk_lock) == LK_KERNPROC)
                                              mp->mnt_stat.f_asyncreads++;
                                      else
                                              mp->mnt_stat.f_syncreads++;
                              } else if (bp->b_iocmd == BIO_WRITE) {
                                      if (LK_HOLDER(bp->b_lock.lk_lock) == LK_KERNPROC)
                                        mp->mnt_stat.f_asyncwrites++;
                                      else
                                        mp->mnt_stat.f_syncwrites++;
                              }
                      }
                      VI_UNLOCK(vp);
              }
      
              cp = bip->bio_from;
              sc = cp->geom->softc;
        if (bip->bio_error && bip->bio_error != EOPNOTSUPP)
                      g_print_bio("g_vfs_done():", bip, "error = %d",
                          bip->bio_error);
              bp->b_error = bip->bio_error;
              bp->b_ioflags = bip->bio_flags;
        if (bip->bio_error)
                      bp->b_ioflags |= BIO_ERROR;
              bp->b_resid = bp->b_bcount - bip->bio_completed;
              g_destroy_bio(bip);
      
              mtx_lock(&sc->sc_mtx);
              destroy = ((--sc->sc_active) == 0 && sc->sc_orphaned);
              mtx_unlock(&sc->sc_mtx);
        if (destroy)
                      g_post_event(g_vfs_destroy, cp, M_WAITOK, NULL);
      
              bufdone(bp);
      }
      
      void
      g_vfs_strategy(struct bufobj *bo, struct buf *bp)
{
              struct g_vfs_softc *sc;
              struct g_consumer *cp;
              struct bio *bip;
      
              cp = bo->bo_private;
              sc = cp->geom->softc;
      
              /*
               * If the provider has orphaned us, just return ENXIO.
               */
              mtx_lock(&sc->sc_mtx);
              if (sc->sc_orphaned) {
                      mtx_unlock(&sc->sc_mtx);
                      bp->b_error = ENXIO;
                      bp->b_ioflags |= BIO_ERROR;
                      bufdone(bp);
                      return;
              }
              sc->sc_active++;
              mtx_unlock(&sc->sc_mtx);
      
              bip = g_alloc_bio();
              bip->bio_cmd = bp->b_iocmd;
              bip->bio_offset = bp->b_iooffset;
              bip->bio_length = bp->b_bcount;
              bdata2bio(bp, bip);
        if ((bp->b_flags & B_BARRIER) != 0) {
                      bip->bio_flags |= BIO_ORDERED;
                      bp->b_flags &= ~B_BARRIER;
              }
        if (bp->b_iocmd == BIO_SPEEDUP)
                      bip->bio_flags |= bp->b_ioflags;
              bip->bio_done = g_vfs_done;
              bip->bio_caller2 = bp;
      #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
              buf_track(bp, __func__);
              bip->bio_track_bp = bp;
      #endif
              g_io_request(bip, cp);
      }
      
      static void
      g_vfs_orphan(struct g_consumer *cp)
      {
              struct g_geom *gp;
              struct g_vfs_softc *sc;
              int destroy;
      
              g_topology_assert();
      
              gp = cp->geom;
              g_trace(G_T_TOPOLOGY, "g_vfs_orphan(%p(%s))", cp, gp->name);
              sc = gp->softc;
              if (sc == NULL)
                      return;
              mtx_lock(&sc->sc_mtx);
              sc->sc_orphaned = 1;
              destroy = (sc->sc_active == 0);
              mtx_unlock(&sc->sc_mtx);
              if (destroy)
                      g_vfs_destroy(cp, 0);
      
              /*
               * Do not destroy the geom.  Filesystem will do that during unmount.
               */
      }
      
      int
      g_vfs_open(struct vnode *vp, struct g_consumer **cpp, const char *fsname, int wr)
      {
              struct g_geom *gp;
              struct g_provider *pp;
              struct g_consumer *cp;
              struct g_vfs_softc *sc;
              struct bufobj *bo;
              int error;
      
              g_topology_assert();
      
              *cpp = NULL;
              bo = &vp->v_bufobj;
              if (bo->bo_private != vp)
                      return (EBUSY);
      
              pp = g_dev_getprovider(vp->v_rdev);
              if (pp == NULL)
                      return (ENOENT);
              gp = g_new_geomf(&g_vfs_class, "%s.%s", fsname, pp->name);
              sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
              mtx_init(&sc->sc_mtx, "g_vfs", NULL, MTX_DEF);
              sc->sc_bo = bo;
              gp->softc = sc;
              cp = g_new_consumer(gp);
              g_attach(cp, pp);
              error = g_access(cp, 1, wr, wr);
              if (error) {
                      g_wither_geom(gp, ENXIO);
                      return (error);
              }
              vnode_create_vobject(vp, pp->mediasize, curthread);
              *cpp = cp;
              cp->private = vp;
              cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
              bo->bo_ops = g_vfs_bufops;
              bo->bo_private = cp;
              bo->bo_bsize = pp->sectorsize;
      
              return (error);
      }
      
      void
      g_vfs_close(struct g_consumer *cp)
      {
              struct g_geom *gp;
              struct g_vfs_softc *sc;
      
              g_topology_assert();
      
              gp = cp->geom;
              sc = gp->softc;
              bufobj_invalbuf(sc->sc_bo, V_SAVE, 0, 0);
              sc->sc_bo->bo_private = cp->private;
              gp->softc = NULL;
              mtx_destroy(&sc->sc_mtx);
              if (!sc->sc_orphaned || cp->provider == NULL)
                      g_wither_geom_close(gp, ENXIO);
              g_free(sc);
      }
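
/*
 * Illustrative (hedged) sketch of how a filesystem might bracket its mount
 * lifetime with g_vfs_open() and g_vfs_close().  The names "example_mount_geom",
 * "example_unmount_geom", "devvp" and "rdonly" are assumptions for the example,
 * error handling is abbreviated, and the block is not compiled here.
 */
#if 0
static int
example_mount_geom(struct vnode *devvp, struct g_consumer **cpp, int rdonly)
{
        int error;

        g_topology_lock();
        error = g_vfs_open(devvp, cpp, "example", rdonly ? 0 : 1);
        g_topology_unlock();
        return (error);
}

static void
example_unmount_geom(struct g_consumer *cp)
{

        g_topology_lock();
        g_vfs_close(cp);
        g_topology_unlock();
}
#endif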
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright 2008-2009 Stacey Son <sson@FreeBSD.org>
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/lock.h>
      #include <sys/lockstat.h>
      #include <sys/sdt.h>
      #include <sys/time.h>
      
      SDT_PROVIDER_DEFINE(lockstat);
      
      SDT_PROBE_DEFINE1(lockstat, , , adaptive__acquire, "struct mtx *");
      SDT_PROBE_DEFINE1(lockstat, , , adaptive__release, "struct mtx *");
      SDT_PROBE_DEFINE2(lockstat, , , adaptive__spin, "struct mtx *", "uint64_t");
      SDT_PROBE_DEFINE2(lockstat, , , adaptive__block, "struct mtx *", "uint64_t");
      
      SDT_PROBE_DEFINE1(lockstat, , , spin__acquire, "struct mtx *");
      SDT_PROBE_DEFINE1(lockstat, , , spin__release, "struct mtx *");
      SDT_PROBE_DEFINE2(lockstat, , , spin__spin, "struct mtx *", "uint64_t");
      
      SDT_PROBE_DEFINE2(lockstat, , , rw__acquire, "struct rwlock *", "int");
      SDT_PROBE_DEFINE2(lockstat, , , rw__release, "struct rwlock *", "int");
      SDT_PROBE_DEFINE5(lockstat, , , rw__block, "struct rwlock *", "uint64_t", "int",
          "int", "int");
      SDT_PROBE_DEFINE2(lockstat, , , rw__spin, "struct rwlock *", "uint64_t");
      SDT_PROBE_DEFINE1(lockstat, , , rw__upgrade, "struct rwlock *");
      SDT_PROBE_DEFINE1(lockstat, , , rw__downgrade, "struct rwlock *");
      
      SDT_PROBE_DEFINE2(lockstat, , , sx__acquire, "struct sx *", "int");
      SDT_PROBE_DEFINE2(lockstat, , , sx__release, "struct sx *", "int");
      SDT_PROBE_DEFINE5(lockstat, , , sx__block, "struct sx *", "uint64_t", "int",
          "int", "int");
      SDT_PROBE_DEFINE2(lockstat, , , sx__spin, "struct sx *", "uint64_t");
      SDT_PROBE_DEFINE1(lockstat, , , sx__upgrade, "struct sx *");
      SDT_PROBE_DEFINE1(lockstat, , , sx__downgrade, "struct sx *");
      
      SDT_PROBE_DEFINE2(lockstat, , , lockmgr__acquire, "struct lock *", "int");
      SDT_PROBE_DEFINE2(lockstat, , , lockmgr__release, "struct lock *", "int");
      SDT_PROBE_DEFINE2(lockstat, , , lockmgr__disown, "struct lock *", "int");
      SDT_PROBE_DEFINE5(lockstat, , , lockmgr__block, "struct lock *", "uint64_t",
          "int", "int", "int");
      SDT_PROBE_DEFINE1(lockstat, , , lockmgr__upgrade, "struct lock *");
      SDT_PROBE_DEFINE1(lockstat, , , lockmgr__downgrade, "struct lock *");
      
      SDT_PROBE_DEFINE2(lockstat, , , thread__spin, "struct mtx *", "uint64_t");
      
      volatile bool __read_frequently lockstat_enabled;
      
      uint64_t 
      lockstat_nsecs(struct lock_object *lo)
{
              struct bintime bt;
              uint64_t ns;
      
        if (!lockstat_enabled)
                      return (0);
              if ((lo->lo_flags & LO_NOPROFILE) != 0)
                      return (0);
      
              binuptime(&bt);
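        /*
         * bt.frac is a 64-bit binary fraction of a second.  Scale its upper
         * 32 bits by 10^9 and shift back down to obtain nanoseconds without
         * overflowing 64-bit arithmetic.
         */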
              ns = bt.sec * (uint64_t)1000000000;
              ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32;
              return (ns);
      }
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2019,2020 Jeffrey Roberson <jeff@FreeBSD.org>
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice unmodified, this list of conditions, and the following
       *    disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
       * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
       * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
       * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
       * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
       * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
       * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/counter.h>
      #include <sys/kernel.h>
      #include <sys/limits.h>
      #include <sys/proc.h>
      #include <sys/smp.h>
      #include <sys/smr.h>
      #include <sys/sysctl.h>
      
      #include <vm/uma.h>
      
      /*
       * Global Unbounded Sequences (GUS)
       *
       * This is a novel safe memory reclamation technique inspired by
       * epoch based reclamation from Samy Al Bahra's concurrency kit which
       * in turn was based on work described in:
       *   Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University
       *   of Cambridge Computing Laboratory.
       * And shares some similarities with:
       *   Wang, Stamler, Parmer. 2016 Parallel Sections: Scaling System-Level
       *   Data-Structures
       *
       * This is not an implementation of hazard pointers or related
       * techniques.  The term safe memory reclamation is used as a
       * generic descriptor for algorithms that defer frees to avoid
       * use-after-free errors with lockless datastructures or as
       * a mechanism to detect quiescence for writer synchronization.
       *
       * The basic approach is to maintain a monotonic write sequence
       * number that is updated on some application defined granularity.
       * Readers record the most recent write sequence number they have
       * observed.  A shared read sequence number records the lowest
       * sequence number observed by any reader as of the last poll.  Any
       * write older than this value has been observed by all readers
       * and memory can be reclaimed.  Like Epoch we also detect idle
       * readers by storing an invalid sequence number in the per-cpu
       * state when the read section exits.  Like Parsec we establish
       * a global write clock that is used to mark memory on free.
       *
       * The write and read sequence numbers can be thought of as a two
       * handed clock with readers always advancing towards writers.  GUS 
       * maintains the invariant that all readers can safely access memory
       * that was visible at the time they loaded their copy of the sequence
       * number.  Periodically the read sequence or hand is polled and
       * advanced as far towards the write sequence as active readers allow.
       * Memory which was freed between the old and new global read sequence
       * number can now be reclaimed.  When the system is idle the two hands
       * meet and no deferred memory is outstanding.  Readers never advance
       * any sequence number, they only observe them.  The shared read
       * sequence number is consequently never higher than the write sequence.
       * A stored sequence number that falls outside of this range has expired
       * and needs no scan to reclaim.
       *
       * A notable distinction between GUS and Epoch, qsbr, rcu, etc. is
       * that advancing the sequence number is decoupled from detecting its
       * observation.  That is to say, the delta between read and write
       * sequence numbers is not bound.  This can be thought of as a more
 * generalized form of epoch, which requires them to be at most one step
       * apart.  This results in a more granular assignment of sequence
       * numbers even as read latencies prohibit all or some expiration.
       * It also allows writers to advance the sequence number and save the
       * poll for expiration until a later time when it is likely to
       * complete without waiting.  The batch granularity and free-to-use
       * latency is dynamic and can be significantly smaller than in more
       * strict systems.
       *
       * This mechanism is primarily intended to be used in coordination with
       * UMA.  By integrating with the allocator we avoid all of the callout
       * queue machinery and are provided with an efficient way to batch
       * sequence advancement and waiting.  The allocator accumulates a full
       * per-cpu cache of memory before advancing the sequence.  It then
       * delays waiting for this sequence to expire until the memory is
       * selected for reuse.  In this way we only increment the sequence
       * value once for n=cache-size frees and the waits are done long
       * after the sequence has been expired so they need only be verified
       * to account for pathological conditions and to advance the read
       * sequence.  Tying the sequence number to the bucket size has the
       * nice property that as the zone gets busier the buckets get larger
       * and the sequence writes become fewer.  If the coherency of advancing
       * the write sequence number becomes too costly we can advance
       * it for every N buckets in exchange for higher free-to-use
       * latency and consequently higher memory consumption.
       *
       * If the read overhead of accessing the shared cacheline becomes
       * especially burdensome an invariant TSC could be used in place of the
       * sequence.  The algorithm would then only need to maintain the minimum
       * observed tsc.  This would trade potential cache synchronization
       * overhead for local serialization and cpu timestamp overhead.
       */
      
      /*
       * A simplified diagram:
       *
       * 0                                                          UINT_MAX
       * | -------------------- sequence number space -------------------- |
       *              ^ rd seq                            ^ wr seq
       *              | ----- valid sequence numbers ---- |
       *                ^cpuA  ^cpuC
       * | -- free -- | --------- deferred frees -------- | ---- free ---- |
       *
       * 
       * In this example cpuA has the lowest sequence number and poll can
       * advance rd seq.  cpuB is not running and is considered to observe
       * wr seq.
       *
       * Freed memory that is tagged with a sequence number between rd seq and
       * wr seq can not be safely reclaimed because cpuA may hold a reference to
       * it.  Any other memory is guaranteed to be unreferenced.
       *
       * Any writer is free to advance wr seq at any time however it may busy
       * poll in pathological cases.
       */
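
/*
 * Illustrative (hedged) usage sketch of the consumer-facing API described
 * above; "example_smr" and the surrounding routines are assumptions for the
 * example, not part of this file, and the block is not compiled.  Readers
 * bracket lockless accesses with smr_enter()/smr_exit(); a writer records
 * the write clock with smr_advance() when retiring memory and later checks
 * smr_poll() before reusing it.
 */
#if 0
static smr_t example_smr;       /* created with smr_create("example", 0, 0) */

static void
example_reader(void *obj)
{

        smr_enter(example_smr);
        /* Lockless reads of obj are safe until smr_exit(). */
        smr_exit(example_smr);
}

static smr_seq_t
example_retire(void)
{

        /* Tag the retired memory with the current write clock. */
        return (smr_advance(example_smr));
}

static bool
example_can_reuse(smr_seq_t goal)
{

        /* True once every reader has observed 'goal'. */
        return (smr_poll(example_smr, goal, false));
}
#endif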
      
      static uma_zone_t smr_shared_zone;
      static uma_zone_t smr_zone;
      
      #ifndef INVARIANTS
      #define        SMR_SEQ_INIT        1                /* All valid sequence numbers are odd. */
      #define        SMR_SEQ_INCR        2
      
      /*
       * SMR_SEQ_MAX_DELTA is the maximum distance allowed between rd_seq and
 * wr_seq.  For the modular arithmetic to work a value of UINT_MAX / 2
       * would be possible but it is checked after we increment the wr_seq so
       * a safety margin is left to prevent overflow.
       *
       * We will block until SMR_SEQ_MAX_ADVANCE sequence numbers have progressed
       * to prevent integer wrapping.  See smr_advance() for more details.
       */
      #define        SMR_SEQ_MAX_DELTA        (UINT_MAX / 4)
      #define        SMR_SEQ_MAX_ADVANCE        (SMR_SEQ_MAX_DELTA - 1024)
      #else
      /* We want to test the wrapping feature in invariants kernels. */
      #define        SMR_SEQ_INCR        (UINT_MAX / 10000)
      #define        SMR_SEQ_INIT        (UINT_MAX - 100000)
      /* Force extra polls to test the integer overflow detection. */
      #define        SMR_SEQ_MAX_DELTA        (SMR_SEQ_INCR * 32)
#define        SMR_SEQ_MAX_ADVANCE        (SMR_SEQ_MAX_DELTA / 2)
      #endif
      
      /*
       * The grace period for lazy (tick based) SMR.
       *
       * Hardclock is responsible for advancing ticks on a single CPU while every
 * CPU receives a regular clock interrupt.  The clock interrupts flush the
 * store buffers and any speculative loads that may violate our invariants.
       * Because these interrupts are not synchronized we must wait one additional
       * tick in the future to be certain that all processors have had their state
       * synchronized by an interrupt.
       *
       * This assumes that the clock interrupt will only be delayed by other causes
       * that will flush the store buffer or prevent access to the section protected
 * data.  For example, an idle processor, a system management interrupt, or
 * a vm exit.
       */
      #define        SMR_LAZY_GRACE                2
      #define        SMR_LAZY_INCR                (SMR_LAZY_GRACE * SMR_SEQ_INCR)
      
      /*
       * The maximum sequence number ahead of wr_seq that may still be valid.  The
       * sequence may not be advanced on write for lazy or deferred SMRs.  In this
       * case poll needs to attempt to forward the sequence number if the goal is
       * within wr_seq + SMR_SEQ_ADVANCE.
       */
      #define        SMR_SEQ_ADVANCE                SMR_LAZY_INCR
      
      static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
          "SMR Stats");
      static COUNTER_U64_DEFINE_EARLY(advance);
      SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RW, &advance, "");
      static COUNTER_U64_DEFINE_EARLY(advance_wait);
      SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RW, &advance_wait, "");
      static COUNTER_U64_DEFINE_EARLY(poll);
      SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RW, &poll, "");
      static COUNTER_U64_DEFINE_EARLY(poll_scan);
      SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RW, &poll_scan, "");
      static COUNTER_U64_DEFINE_EARLY(poll_fail);
      SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_fail, CTLFLAG_RW, &poll_fail, "");
      
      /*
       * Advance a lazy write sequence number.  These move forward at the rate of
       * ticks.  Grace is SMR_LAZY_INCR (2 ticks) in the future.
       *
       * This returns the goal write sequence number.
       */
      static smr_seq_t
      smr_lazy_advance(smr_t smr, smr_shared_t s)
      {
              union s_wr s_wr, old;
              int t, d;
      
              CRITICAL_ASSERT(curthread);
      
              /*
               * Load the stored ticks value before the current one.  This way the
               * current value can only be the same or larger.
               */
              old._pair = s_wr._pair = atomic_load_acq_64(&s->s_wr._pair);
              t = ticks;
      
              /*
         * The most probable condition is that the update has already taken place.
               */
              d = t - s_wr.ticks;
              if (__predict_true(d == 0))
                      goto out;
              /* Cap the rate of advancement and handle long idle periods. */
              if (d > SMR_LAZY_GRACE || d < 0)
                      d = SMR_LAZY_GRACE;
              s_wr.ticks = t;
              s_wr.seq += d * SMR_SEQ_INCR;
      
              /*
               * This can only fail if another thread races to call advance().
               * Strong cmpset semantics mean we are guaranteed that the update
               * happened.
               */
              atomic_cmpset_64(&s->s_wr._pair, old._pair, s_wr._pair);
      out:
              return (s_wr.seq + SMR_LAZY_INCR);
      }
      
      /*
       * Increment the shared write sequence by 2.  Since it is initialized
       * to 1 this means the only valid values are odd and an observed value
       * of 0 in a particular CPU means it is not currently in a read section.
       */
      static smr_seq_t
      smr_shared_advance(smr_shared_t s)
      {
      
              return (atomic_fetchadd_int(&s->s_wr.seq, SMR_SEQ_INCR) + SMR_SEQ_INCR);
      }
      
      /*
       * Advance the write sequence number for a normal smr section.  If the
       * write sequence is too far behind the read sequence we have to poll
       * to advance rd_seq and prevent undetectable wraps.
       */
      static smr_seq_t
      smr_default_advance(smr_t smr, smr_shared_t s)
{
              smr_seq_t goal, s_rd_seq;
      
              CRITICAL_ASSERT(curthread);
              KASSERT((zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
                  ("smr_default_advance: called with lazy smr."));
      
              /*
               * Load the current read seq before incrementing the goal so
               * we are guaranteed it is always < goal.
               */
              s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
              goal = smr_shared_advance(s);
      
              /*
               * Force a synchronization here if the goal is getting too
               * far ahead of the read sequence number.  This keeps the
               * wrap detecting arithmetic working in pathological cases.
               */
        if (SMR_SEQ_DELTA(goal, s_rd_seq) >= SMR_SEQ_MAX_DELTA) {
                      counter_u64_add(advance_wait, 1);
                      smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE);
              }
              counter_u64_add(advance, 1);
      
              return (goal);
      }
      
      /*
 * Deferred SMRs conditionally update s_wr_seq based on a
 * cpu-local interval count.
       */
      static smr_seq_t
      smr_deferred_advance(smr_t smr, smr_shared_t s, smr_t self)
      {
      
              if (++self->c_deferred < self->c_limit)
                      return (smr_shared_current(s) + SMR_SEQ_INCR);
              self->c_deferred = 0;
              return (smr_default_advance(smr, s));
      }
      
      /*
       * Advance the write sequence and return the value for use as the
       * wait goal.  This guarantees that any changes made by the calling
       * thread prior to this call will be visible to all threads after
       * rd_seq meets or exceeds the return value.
       *
       * This function may busy loop if the readers are roughly 1 billion
       * sequence numbers behind the writers.
       *
       * Lazy SMRs will not busy loop and the wrap happens every 25 days
       * at 1khz and 60 hours at 10khz.  Readers can block for no longer
       * than half of this for SMR_SEQ_ macros to continue working.
       */
      smr_seq_t
      smr_advance(smr_t smr)
{
              smr_t self;
              smr_shared_t s;
              smr_seq_t goal;
              int flags;
      
              /*
               * It is illegal to enter while in an smr section.
               */
        SMR_ASSERT_NOT_ENTERED(smr);
      
              /*
               * Modifications not done in a smr section need to be visible
               * before advancing the seq.
               */
              atomic_thread_fence_rel();
      
              critical_enter();
              /* Try to touch the line once. */
              self = zpcpu_get(smr);
              s = self->c_shared;
              flags = self->c_flags;
              goal = SMR_SEQ_INVALID;
              if ((flags & (SMR_LAZY | SMR_DEFERRED)) == 0)
                goal = smr_default_advance(smr, s);
              else if ((flags & SMR_LAZY) != 0)
                      goal = smr_lazy_advance(smr, s);
              else if ((flags & SMR_DEFERRED) != 0)
                      goal = smr_deferred_advance(smr, s, self);
              critical_exit();
      
              return (goal);
      }
      
      /*
       * Poll to determine the currently observed sequence number on a cpu
       * and spinwait if the 'wait' argument is true.
       */
      static smr_seq_t
      smr_poll_cpu(smr_t c, smr_seq_t s_rd_seq, smr_seq_t goal, bool wait)
      {
              smr_seq_t c_seq;
      
              c_seq = SMR_SEQ_INVALID;
              for (;;) {
                      c_seq = atomic_load_int(&c->c_seq);
                if (c_seq == SMR_SEQ_INVALID)
                              break;
      
                      /*
                       * There is a race described in smr.h:smr_enter that
                       * can lead to a stale seq value but not stale data
                       * access.  If we find a value out of range here we
                       * pin it to the current min to prevent it from
                       * advancing until that stale section has expired.
                       *
                       * The race is created when a cpu loads the s_wr_seq
                       * value in a local register and then another thread
                       * advances s_wr_seq and calls smr_poll() which will
                 * observe no value yet in c_seq and advance s_rd_seq
                 * up to s_wr_seq which is beyond the register
                 * cached value.  This is only likely to happen on
                 * a hypervisor or with a system management interrupt.
                       */
                      if (SMR_SEQ_LT(c_seq, s_rd_seq))
                              c_seq = s_rd_seq;
      
                      /*
                       * If the sequence number meets the goal we are done
                       * with this cpu.
                       */
                      if (SMR_SEQ_LEQ(goal, c_seq))
                              break;
      
                      if (!wait)
                              break;
                      cpu_spinwait();
              }
      
              return (c_seq);
      }
      
      /*
       * Loop until all cores have observed the goal sequence or have
 * gone inactive.  Returns the oldest sequence currently active.
       *
       * This function assumes a snapshot of sequence values has
       * been obtained and validated by smr_poll().
       */
      static smr_seq_t
      smr_poll_scan(smr_t smr, smr_shared_t s, smr_seq_t s_rd_seq,
          smr_seq_t s_wr_seq, smr_seq_t goal, bool wait)
      {
              smr_seq_t rd_seq, c_seq;
              int i;
      
              CRITICAL_ASSERT(curthread);
              counter_u64_add_protected(poll_scan, 1);
      
              /*
               * The read sequence can be no larger than the write sequence at
               * the start of the poll.
               */
              rd_seq = s_wr_seq;
        CPU_FOREACH(i) {
                      /*
                       * Query the active sequence on this cpu.  If we're not
                       * waiting and we don't meet the goal we will still scan
                       * the rest of the cpus to update s_rd_seq before returning
                       * failure.
                       */
                      c_seq = smr_poll_cpu(zpcpu_get_cpu(smr, i), s_rd_seq, goal,
                          wait);
      
                      /*
                       * Limit the minimum observed rd_seq whether we met the goal
                       * or not.
                       */
                      if (c_seq != SMR_SEQ_INVALID)
                              rd_seq = SMR_SEQ_MIN(rd_seq, c_seq);
              }
      
              /*
               * Advance the rd_seq as long as we observed a more recent value.
               */
              s_rd_seq = atomic_load_int(&s->s_rd_seq);
              if (SMR_SEQ_GT(rd_seq, s_rd_seq)) {
                atomic_cmpset_int(&s->s_rd_seq, s_rd_seq, rd_seq);
                      s_rd_seq = rd_seq;
              }
      
              return (s_rd_seq);
      }
      
      /*
       * Poll to determine whether all readers have observed the 'goal' write
       * sequence number.
       *
       * If wait is true this will spin until the goal is met.
       *
 * This routine will update the minimum observed read sequence number in
       * s_rd_seq if it does a scan.  It may not do a scan if another call has
       * advanced s_rd_seq beyond the callers goal already.
       *
       * Returns true if the goal is met and false if not.
       */
      bool
      smr_poll(smr_t smr, smr_seq_t goal, bool wait)
{
              smr_shared_t s;
              smr_t self;
              smr_seq_t s_wr_seq, s_rd_seq;
              smr_delta_t delta;
              int flags;
              bool success;
      
              /*
               * It is illegal to enter while in an smr section.
               */
        KASSERT(!wait || !SMR_ENTERED(smr),
            ("smr_poll: Blocking not allowed in a SMR section."));
        KASSERT(!wait || (zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
            ("smr_poll: Blocking not allowed on lazy smrs."));
      
              /*
               * Use a critical section so that we can avoid ABA races
               * caused by long preemption sleeps.
               */
              success = true;
              critical_enter();
              /* Attempt to load from self only once. */
        self = zpcpu_get(smr);
              s = self->c_shared;
              flags = self->c_flags;
              counter_u64_add_protected(poll, 1);
      
              /*
               * Conditionally advance the lazy write clock on any writer
               * activity.
               */
        if ((flags & SMR_LAZY) != 0)
                      smr_lazy_advance(smr, s);
      
              /*
               * Acquire barrier loads s_wr_seq after s_rd_seq so that we can not
               * observe an updated read sequence that is larger than write.
               */
              s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
      
              /*
               * If we have already observed the sequence number we can immediately
               * return success.  Most polls should meet this criterion.
               */
        if (SMR_SEQ_LEQ(goal, s_rd_seq))
                      goto out;
      
              /*
               * wr_seq must be loaded prior to any c_seq value so that a
               * stale c_seq can only reference time after this wr_seq.
               */
              s_wr_seq = atomic_load_acq_int(&s->s_wr.seq);
      
              /*
               * This is the distance from s_wr_seq to goal.  Positive values
               * are in the future.
               */
              delta = SMR_SEQ_DELTA(goal, s_wr_seq);
      
              /*
               * Detect a stale wr_seq.
               *
               * This goal may have come from a deferred advance or a lazy
               * smr.  If we are not blocking we can not succeed but the
               * sequence number is valid.
               */
              if (delta > 0 && delta <= SMR_SEQ_ADVANCE &&
                  (flags & (SMR_LAZY | SMR_DEFERRED)) != 0) {
                      if (!wait) {
                              success = false;
                              goto out;
                      }
                      /* LAZY is always !wait. */
                      s_wr_seq = smr_shared_advance(s);
                      delta = 0;
              }
      
              /*
               * Detect an invalid goal.
               *
               * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for
               * it to be valid.  If it is not then the caller held on to it and
               * the integer wrapped.  If we wrapped back within range the caller
               * will harmlessly scan.
               */
        if (delta > 0)
                      goto out;
      
              /* Determine the lowest visible sequence number. */
        s_rd_seq = smr_poll_scan(smr, s, s_rd_seq, s_wr_seq, goal, wait);
              success = SMR_SEQ_LEQ(goal, s_rd_seq);
      out:
        if (!success)
                      counter_u64_add_protected(poll_fail, 1);
              critical_exit();
      
              /*
               * Serialize with smr_advance()/smr_exit().  The caller is now free
               * to modify memory as expected.
               */
              atomic_thread_fence_acq();
      
              return (success);
      }
      
      smr_t
      smr_create(const char *name, int limit, int flags)
      {
              smr_t smr, c;
              smr_shared_t s;
              int i;
      
              s = uma_zalloc(smr_shared_zone, M_WAITOK);
              smr = uma_zalloc_pcpu(smr_zone, M_WAITOK);
      
              s->s_name = name;
              s->s_rd_seq = s->s_wr.seq = SMR_SEQ_INIT;
              s->s_wr.ticks = ticks;
      
              /* Initialize all CPUS, not just those running. */
              for (i = 0; i <= mp_maxid; i++) {
                      c = zpcpu_get_cpu(smr, i);
                      c->c_seq = SMR_SEQ_INVALID;
                      c->c_shared = s;
                      c->c_deferred = 0;
                      c->c_limit = limit;
                      c->c_flags = flags;
              }
              atomic_thread_fence_seq_cst();
      
              return (smr);
      }
      
      void
      smr_destroy(smr_t smr)
      {
      
              smr_synchronize(smr);
              uma_zfree(smr_shared_zone, smr->c_shared);
              uma_zfree_pcpu(smr_zone, smr);
      }
      
      /*
       * Initialize the UMA slab zone.
       */
      void
      smr_init(void)
      {
      
              smr_shared_zone = uma_zcreate("SMR SHARED", sizeof(struct smr_shared),
                  NULL, NULL, NULL, NULL, (CACHE_LINE_SIZE * 2) - 1, 0);
              smr_zone = uma_zcreate("SMR CPU", sizeof(struct smr),
                  NULL, NULL, NULL, NULL, (CACHE_LINE_SIZE * 2) - 1, UMA_ZONE_PCPU);
      }
      /*-
       * CAM request queue management functions.
       *
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 1997 Justin T. Gibbs.
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions, and the following disclaimer,
       *    without modification, immediately at the beginning of the file.
       * 2. The name of the author may not be used to endorse or promote products
       *    derived from this software without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
       * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/types.h>
      #include <sys/malloc.h>
      #include <sys/kernel.h>
      
      #include <cam/cam.h>
      #include <cam/cam_ccb.h>
      #include <cam/cam_queue.h>
      #include <cam/cam_debug.h>
      
      static MALLOC_DEFINE(M_CAMQ, "CAM queue", "CAM queue buffers");
      static MALLOC_DEFINE(M_CAMDEVQ, "CAM dev queue", "CAM dev queue buffers");
      static MALLOC_DEFINE(M_CAMCCBQ, "CAM ccb queue", "CAM ccb queue buffers");
      
      static __inline int
                      queue_cmp(cam_pinfo **queue_array, int i, int j);
      static __inline void
                      swap(cam_pinfo **queue_array, int i, int j);
      static void        heap_up(cam_pinfo **queue_array, int new_index);
      static void        heap_down(cam_pinfo **queue_array, int index,
                                int last_index);
      
      int
      camq_init(struct camq *camq, int size)
      {
              bzero(camq, sizeof(*camq));
              camq->array_size = size;
              if (camq->array_size != 0) {
                      camq->queue_array = (cam_pinfo**)malloc(size*sizeof(cam_pinfo*),
                                                              M_CAMQ, M_NOWAIT);
                      if (camq->queue_array == NULL) {
                              printf("camq_init: - cannot malloc array!\n");
                              return (1);
                      }
                      /*
                       * Heap algorithms like everything numbered from 1, so
                       * offset our pointer into the heap array by one element.
                       */
                      camq->queue_array--;
              }
              return (0);
      }
      
      /*
       * Free a camq structure.  This should only be called if a controller
 * driver fails somehow during its attach routine or is unloaded and has
       * obtained a camq structure.  The XPT should ensure that the queue
       * is empty before calling this routine.
       */
      void
      camq_fini(struct camq *queue)
      {
              if (queue->queue_array != NULL) {
                      /*
                       * Heap algorithms like everything numbered from 1, so
                       * our pointer into the heap array is offset by one element.
                       */
                      queue->queue_array++;
                      free(queue->queue_array, M_CAMQ);
              }
      }
      
      u_int32_t
      camq_resize(struct camq *queue, int new_size)
      {
              cam_pinfo **new_array;
      
              KASSERT(new_size >= queue->entries, ("camq_resize: "
                  "New queue size can't accommodate queued entries (%d < %d).",
                  new_size, queue->entries));
              new_array = (cam_pinfo **)malloc(new_size * sizeof(cam_pinfo *),
                                               M_CAMQ, M_NOWAIT);
              if (new_array == NULL) {
                      /* Couldn't satisfy request */
                      return (CAM_RESRC_UNAVAIL);
              }
              /*
               * Heap algorithms like everything numbered from 1, so
               * remember that our pointer into the heap array is offset
               * by one element.
               */
              if (queue->queue_array != NULL) {
                      queue->queue_array++;
                      bcopy(queue->queue_array, new_array,
                            queue->entries * sizeof(cam_pinfo *));
                      free(queue->queue_array, M_CAMQ);
              }
              queue->queue_array = new_array-1;
              queue->array_size = new_size;
              return (CAM_REQ_CMP);
      }
      
      /*
 * camq_insert: Given an array of cam_pinfo* elements with
       * the Heap(1, num_elements) property and array_size - num_elements >= 1,
       * output Heap(1, num_elements+1) including new_entry in the array.
       */
      void
      camq_insert(struct camq *queue, cam_pinfo *new_entry)
{
      
              KASSERT(queue->entries < queue->array_size,
                  ("camq_insert: Attempt to insert into a full queue (%d >= %d)",
                  queue->entries, queue->array_size));
              queue->entries++;
              queue->queue_array[queue->entries] = new_entry;
              new_entry->index = queue->entries;
              if (queue->entries != 0)
                      heap_up(queue->queue_array, queue->entries);
      }
      
      /*
 * camq_remove:  Given an array of cam_pinfo* elements with the
       * Heap(1, num_elements) property and an index such that 1 <= index <=
       * num_elements, remove that entry and restore the Heap(1, num_elements-1)
       * property.
       */
      cam_pinfo *
      camq_remove(struct camq *queue, int index)
{
              cam_pinfo *removed_entry;
      
              if (index <= 0 || index > queue->entries)
                      panic("%s: Attempt to remove out-of-bounds index %d "
                          "from queue %p of size %d", __func__, index, queue,
                          queue->entries);
      
              removed_entry = queue->queue_array[index];
	if (queue->entries != index) {
                      queue->queue_array[index] = queue->queue_array[queue->entries];
                      queue->queue_array[index]->index = index;
                      heap_down(queue->queue_array, index, queue->entries - 1);
              }
              removed_entry->index = CAM_UNQUEUED_INDEX;
              queue->entries--;
              return (removed_entry);
      }
      
      /*
 * camq_change_priority:  Given an array of cam_pinfo* elements with the
 * Heap(1, num_elements) property, an index such that 1 <= index <= num_elements,
 * and a new priority for the element at index, change the priority of
 * element index and restore the Heap(1, num_elements) property.
       */
      void
      camq_change_priority(struct camq *queue, int index, u_int32_t new_priority)
      {
              if (new_priority > queue->queue_array[index]->priority) {
                      queue->queue_array[index]->priority = new_priority;
                      heap_down(queue->queue_array, index, queue->entries);
              } else {
                      /* new_priority <= old_priority */
                      queue->queue_array[index]->priority = new_priority;
                      heap_up(queue->queue_array, index);
              }
      }
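
/*
 * Illustrative sketch of minimal use of the camq heap above.  The function,
 * the two cam_pinfo objects and their priorities are arbitrary examples;
 * camq_init(), camq_insert(), camq_remove(), camq_fini() and CAMQ_HEAD come
 * from this file and cam_queue.h.
 */
static void __unused
camq_usage_sketch(void)
{
	struct camq q;
	cam_pinfo a = { .priority = 10 };
	cam_pinfo b = { .priority = 5 };
	cam_pinfo *first;

	if (camq_init(&q, 2) != 0)
		return;
	camq_insert(&q, &a);
	camq_insert(&q, &b);

	/* The smallest priority value sorts to the head of the heap. */
	first = camq_remove(&q, CAMQ_HEAD);
	KASSERT(first == &b, ("camq_usage_sketch: unexpected head"));
	(void)first;
	(void)camq_remove(&q, CAMQ_HEAD);
	camq_fini(&q);
}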
      
      struct cam_devq *
      cam_devq_alloc(int devices, int openings)
      {
              struct cam_devq *devq;
      
              devq = (struct cam_devq *)malloc(sizeof(*devq), M_CAMDEVQ, M_NOWAIT);
              if (devq == NULL) {
                      printf("cam_devq_alloc: - cannot malloc!\n");
                      return (NULL);
              }
              if (cam_devq_init(devq, devices, openings) != 0) {
                      free(devq, M_CAMDEVQ);
                      return (NULL);
              }
              return (devq);
      }
      
      int
      cam_devq_init(struct cam_devq *devq, int devices, int openings)
      {
      
              bzero(devq, sizeof(*devq));
              mtx_init(&devq->send_mtx, "CAM queue lock", NULL, MTX_DEF);
              if (camq_init(&devq->send_queue, devices) != 0)
                      return (1);
              devq->send_openings = openings;
              devq->send_active = 0;
              return (0);
      }
      
      void
      cam_devq_free(struct cam_devq *devq)
      {
      
              camq_fini(&devq->send_queue);
              mtx_destroy(&devq->send_mtx);
              free(devq, M_CAMDEVQ);
      }
      
      u_int32_t
      cam_devq_resize(struct cam_devq *camq, int devices)
      {
              u_int32_t retval;
      
              retval = camq_resize(&camq->send_queue, devices);
              return (retval);
      }
      
      struct cam_ccbq *
      cam_ccbq_alloc(int openings)
      {
              struct cam_ccbq *ccbq;
      
              ccbq = (struct cam_ccbq *)malloc(sizeof(*ccbq), M_CAMCCBQ, M_NOWAIT);
              if (ccbq == NULL) {
                      printf("cam_ccbq_alloc: - cannot malloc!\n");
                      return (NULL);
              }
              if (cam_ccbq_init(ccbq, openings) != 0) {
                      free(ccbq, M_CAMCCBQ);
                      return (NULL);                
              }
              
              return (ccbq);
      }
      
      void
      cam_ccbq_free(struct cam_ccbq *ccbq)
      {
              if (ccbq) {
                      cam_ccbq_fini(ccbq);
                      free(ccbq, M_CAMCCBQ);
              }
      }
      
      u_int32_t
      cam_ccbq_resize(struct cam_ccbq *ccbq, int new_size)
      {
              int delta;
      
              delta = new_size - (ccbq->dev_active + ccbq->dev_openings);
              ccbq->total_openings += delta;
              ccbq->dev_openings += delta;
      
              new_size = imax(64, 1 << fls(new_size + new_size / 2));
              if (new_size > ccbq->queue.array_size)
                      return (camq_resize(&ccbq->queue, new_size));
              else
                      return (CAM_REQ_CMP);
      }
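
/*
 * The sizing expression used by cam_ccbq_resize() above and cam_ccbq_init()
 * below adds 50% headroom and rounds the result up to the next power of two,
 * with a floor of 64 entries: e.g. a requested size of 100 gives
 * fls(150) == 8, so the array is sized to 1 << 8 == 256 slots.
 */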
      
      int
      cam_ccbq_init(struct cam_ccbq *ccbq, int openings)
      {
              bzero(ccbq, sizeof(*ccbq));
              if (camq_init(&ccbq->queue,
                  imax(64, 1 << fls(openings + openings / 2))) != 0)
                      return (1);
              ccbq->total_openings = openings;
              ccbq->dev_openings = openings;
              return (0);
      }
      
      void
      cam_ccbq_fini(struct cam_ccbq *ccbq)
      {
      
              camq_fini(&ccbq->queue);
      }
      
      /*
       * Heap routines for manipulating CAM queues.
       */
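/*
 * The heap is stored 1-based: for the element at index i, the parent lives
 * at i / 2 and the children at 2 * i and 2 * i + 1, which is why heap_up()
 * walks up with a right shift and heap_down() walks down with a left shift.
 */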
      /*
 * queue_cmp: Given an array of cam_pinfo* elements and indexes i
 * and j, return less than 0, 0, or greater than 0 if the element at i is
 * less than, equal to, or greater than the element at j, respectively.
       */
      static __inline int
      queue_cmp(cam_pinfo **queue_array, int i, int j)
      {
              if (queue_array[i]->priority == queue_array[j]->priority)
                      return (  queue_array[i]->generation
                              - queue_array[j]->generation );
              else
                      return (  queue_array[i]->priority
                              - queue_array[j]->priority );
      }
      
      /*
       * swap: Given an array of cam_pinfo* elements and indexes i and j,
       * exchange elements i and j.
       */
      static __inline void
      swap(cam_pinfo **queue_array, int i, int j)
      {
              cam_pinfo *temp_qentry;
      
              temp_qentry = queue_array[j];
              queue_array[j] = queue_array[i];
              queue_array[i] = temp_qentry;
              queue_array[j]->index = j;
              queue_array[i]->index = i;
      }
      
      /*
       * heap_up:  Given an array of cam_pinfo* elements with the
       * Heap(1, new_index-1) property and a new element in location
       * new_index, output Heap(1, new_index).
       */
      static void
      heap_up(cam_pinfo **queue_array, int new_index)
      {
              int child;
              int parent;
      
              child = new_index;
      
	while (child != 1) {
      
                      parent = child >> 1;
                      if (queue_cmp(queue_array, parent, child) <= 0)
                              break;
                      swap(queue_array, parent, child);
                      child = parent;
              }
      }
      
      /*
       * heap_down:  Given an array of cam_pinfo* elements with the
       * Heap(index + 1, num_entries) property with index containing
       * an unsorted entry, output Heap(index, num_entries).
       */
      static void
      heap_down(cam_pinfo **queue_array, int index, int num_entries)
      {
              int child;
              int parent;
              
              parent = index;
              child = parent << 1;
              for (; child <= num_entries; child = parent << 1) {
      
                      if (child < num_entries) {
                              /* child+1 is the right child of parent */
                              if (queue_cmp(queue_array, child + 1, child) < 0)
                                      child++;
                      }
                      /* child is now the least child of parent */
                      if (queue_cmp(queue_array, parent, child) <= 0)
                              break;
                      swap(queue_array, child, parent);
                      parent = child;
              }
      }
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       * $FreeBSD$
       */
      
      #ifndef __MACHINE_COUNTER_H__
      #define __MACHINE_COUNTER_H__
      
      #include <sys/pcpu.h>
      
      #define        EARLY_COUNTER        (void *)__offsetof(struct pcpu, pc_early_dummy_counter)
      
      #define        counter_enter()        do {} while (0)
      #define        counter_exit()        do {} while (0)
      
      #ifdef IN_SUBR_COUNTER_C
      static inline uint64_t
      counter_u64_read_one(counter_u64_t c, int cpu)
      {
      
              MPASS(c != EARLY_COUNTER);
	return (*zpcpu_get_cpu(c, cpu));
      }
      
      static inline uint64_t
      counter_u64_fetch_inline(uint64_t *c)
      {
              uint64_t r;
              int cpu;
      
              r = 0;
	CPU_FOREACH(cpu)
                      r += counter_u64_read_one(c, cpu);
      
              return (r);
      }
      
      static void
      counter_u64_zero_one_cpu(void *arg)
{
              counter_u64_t c;
      
              c = arg;
              MPASS(c != EARLY_COUNTER);
	*(zpcpu_get(c)) = 0;
      }
      
      static inline void
      counter_u64_zero_inline(counter_u64_t c)
      {
      
              smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu,
                  smp_no_rendezvous_barrier, c);
      }
      #endif
      
      #define        counter_u64_add_protected(c, i)        counter_u64_add(c, i)
      
      static inline void
      counter_u64_add(counter_u64_t c, int64_t inc)
      {
      
              KASSERT(IS_BSP() || c != EARLY_COUNTER, ("EARLY_COUNTER used on AP"));
              zpcpu_add(c, inc);
      }
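
/*
 * Illustrative usage sketch: consumers normally go through the MI wrappers
 * declared in sys/counter.h rather than these MD inlines, e.g.:
 *
 *	counter_u64_t cnt = counter_u64_alloc(M_WAITOK);
 *	counter_u64_add(cnt, 1);
 *	printf("count: %ju\n", (uintmax_t)counter_u64_fetch(cnt));
 *	counter_u64_free(cnt);
 */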
      
      #endif        /* ! __MACHINE_COUNTER_H__ */
      /*-
       * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. The name of the author may not be used to endorse or promote
       *    products derived from this software without specific prior written
       *    permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      /*
       * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
       * are the number of compression rounds and the number of finalization rounds.
       * A compression round is identical to a finalization round and this round
       * function is called SipRound.  Given a 128-bit key k and a (possibly empty)
       * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
       *
       * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
       * by Jean-Philippe Aumasson and Daniel J. Bernstein,
       * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
       * https://131002.net/siphash/siphash.pdf
       * https://131002.net/siphash/
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/types.h>
      #include <sys/systm.h>
      #include <sys/libkern.h>
      #include <sys/endian.h>
      
      #include <crypto/siphash/siphash.h>
      
      static void        SipRounds(SIPHASH_CTX *ctx, int final);
      
      void
      SipHash_InitX(SIPHASH_CTX *ctx, int rc, int rf)
{
      
              ctx->v[0] = 0x736f6d6570736575ull;
              ctx->v[1] = 0x646f72616e646f6dull;
              ctx->v[2] = 0x6c7967656e657261ull;
              ctx->v[3] = 0x7465646279746573ull;
              ctx->buf.b64 = 0;
              ctx->bytes = 0;
              ctx->buflen = 0;
              ctx->rounds_compr = rc;
              ctx->rounds_final = rf;
              ctx->initialized = 1;
      }
      
      void
      SipHash_SetKey(SIPHASH_CTX *ctx, const uint8_t key[static SIPHASH_KEY_LENGTH])
{
              uint64_t k[2];
      
              KASSERT(ctx->v[0] == 0x736f6d6570736575ull &&
                  ctx->initialized == 1,
                  ("%s: context %p not properly initialized", __func__, ctx));
      
	k[0] = le64dec(&key[0]);
              k[1] = le64dec(&key[8]);
      
              ctx->v[0] ^= k[0];
              ctx->v[1] ^= k[1];
              ctx->v[2] ^= k[0];
              ctx->v[3] ^= k[1];
      
              ctx->initialized = 2;
      }
      
      static size_t
      SipBuf(SIPHASH_CTX *ctx, const uint8_t **src, size_t len, int final)
      {
              size_t x = 0;
      
              KASSERT((!final && len > 0) || (final && len == 0),
                  ("%s: invalid parameters", __func__));
      
              if (!final) {
                      x = MIN(len, sizeof(ctx->buf.b64) - ctx->buflen);
                      bcopy(*src, &ctx->buf.b8[ctx->buflen], x);
                      ctx->buflen += x;
                      *src += x;
              } else
                      ctx->buf.b8[7] = (uint8_t)ctx->bytes;
      
	if (ctx->buflen == 8 || final) {
                      ctx->v[3] ^= le64toh(ctx->buf.b64);
		SipRounds(ctx, 0);
                      ctx->v[0] ^= le64toh(ctx->buf.b64);
                      ctx->buf.b64 = 0;
                      ctx->buflen = 0;
              }
              return (x);
      }
      
      void
      SipHash_Update(SIPHASH_CTX *ctx, const void *src, size_t len)
{
              uint64_t m;
              const uint64_t *p;
              const uint8_t *s;
              size_t rem;
      
              KASSERT(ctx->initialized == 2,
                  ("%s: context %p not properly initialized", __func__, ctx));
      
              s = src;
              ctx->bytes += len;
      
              /*
               * Push length smaller than block size into buffer or
               * fill up the buffer if there is already something
               * in it.
               */
              if (ctx->buflen > 0 || len < 8)
                      len -= SipBuf(ctx, &s, len, 0);
	if (len == 0)
                      return;
      
              rem = len & 0x7;
              len >>= 3;
      
	/* Optimize for 64-bit aligned/unaligned access. */
              if (((uintptr_t)s & 0x7) == 0) {
                      for (p = (const uint64_t *)s; len > 0; len--, p++) {
                              m = le64toh(*p);
                              ctx->v[3] ^= m;
                              SipRounds(ctx, 0);
                              ctx->v[0] ^= m;
                      }
                      s = (const uint8_t *)p;
              } else {
		for (; len > 0; len--, s += 8) {
                              m = le64dec(s);
                              ctx->v[3] ^= m;
                              SipRounds(ctx, 0);
                              ctx->v[0] ^= m;
                      }
              }
      
              /* Push remainder into buffer. */
              if (rem > 0)
                      (void)SipBuf(ctx, &s, rem, 0);
      }
      
      void
      SipHash_Final(uint8_t dst[static SIPHASH_DIGEST_LENGTH], SIPHASH_CTX *ctx)
{
              uint64_t r;
      
              KASSERT(ctx->initialized == 2,
                  ("%s: context %p not properly initialized", __func__, ctx));
      
	r = SipHash_End(ctx);
              le64enc(dst, r);
      }
      
      uint64_t
      SipHash_End(SIPHASH_CTX *ctx)
{
              uint64_t r;
      
              KASSERT(ctx->initialized == 2,
                  ("%s: context %p not properly initialized", __func__, ctx));
      
	SipBuf(ctx, NULL, 0, 1);
              ctx->v[2] ^= 0xff;
	SipRounds(ctx, 1);
              r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
      
              bzero(ctx, sizeof(*ctx));
              return (r);
      }
      
      uint64_t
      SipHashX(SIPHASH_CTX *ctx, int rc, int rf,
          const uint8_t key[static SIPHASH_KEY_LENGTH], const void *src, size_t len)
      {
      
              SipHash_InitX(ctx, rc, rf);
              SipHash_SetKey(ctx, key);
              SipHash_Update(ctx, src, len);
      
              return (SipHash_End(ctx));
      }
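
/*
 * Illustrative sketch of the one-shot interface above; the key bytes and
 * the message are arbitrary examples.  SIPHASH_CTX and SIPHASH_KEY_LENGTH
 * come from crypto/siphash/siphash.h.
 */
static uint64_t __unused
siphash_usage_sketch(void)
{
	SIPHASH_CTX ctx;
	static const uint8_t key[SIPHASH_KEY_LENGTH] = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	};
	static const char msg[] = "example message";

	/* SipHash-2-4: two compression rounds, four finalization rounds. */
	return (SipHashX(&ctx, 2, 4, key, msg, sizeof(msg) - 1));
}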
      
      #define SIP_ROTL(x, b)        (uint64_t)(((x) << (b)) | ( (x) >> (64 - (b))))
      
      static void
      SipRounds(SIPHASH_CTX *ctx, int final)
      {
              int rounds;
      
              if (!final)
                      rounds = ctx->rounds_compr;
              else
                      rounds = ctx->rounds_final;
      
	while (rounds--) {
		ctx->v[0] += ctx->v[1];
                      ctx->v[2] += ctx->v[3];
                      ctx->v[1] = SIP_ROTL(ctx->v[1], 13);
                      ctx->v[3] = SIP_ROTL(ctx->v[3], 16);
      
                      ctx->v[1] ^= ctx->v[0];
                      ctx->v[3] ^= ctx->v[2];
                      ctx->v[0] = SIP_ROTL(ctx->v[0], 32);
      
                      ctx->v[2] += ctx->v[1];
                      ctx->v[0] += ctx->v[3];
                      ctx->v[1] = SIP_ROTL(ctx->v[1], 17);
                      ctx->v[3] = SIP_ROTL(ctx->v[3], 21);
      
                      ctx->v[1] ^= ctx->v[2];
                      ctx->v[3] ^= ctx->v[0];
                      ctx->v[2] = SIP_ROTL(ctx->v[2], 32);
              }
      }
      
      /*-
       * SPDX-License-Identifier: BSD-3-Clause
       *
       * Copyright (c) 1989, 1991, 1993, 1994
       *        The Regents of the University of California.  All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)ffs_vfsops.c        8.31 (Berkeley) 5/20/95
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_quota.h"
      #include "opt_ufs.h"
      #include "opt_ffs.h"
      #include "opt_ddb.h"
      
      #include <sys/param.h>
      #include <sys/gsb_crc32.h>
      #include <sys/systm.h>
      #include <sys/namei.h>
      #include <sys/priv.h>
      #include <sys/proc.h>
      #include <sys/taskqueue.h>
      #include <sys/kernel.h>
      #include <sys/ktr.h>
      #include <sys/vnode.h>
      #include <sys/mount.h>
      #include <sys/bio.h>
      #include <sys/buf.h>
      #include <sys/conf.h>
      #include <sys/fcntl.h>
      #include <sys/ioccom.h>
      #include <sys/malloc.h>
      #include <sys/mutex.h>
      #include <sys/rwlock.h>
      #include <sys/vmmeter.h>
      
      #include <security/mac/mac_framework.h>
      
      #include <ufs/ufs/dir.h>
      #include <ufs/ufs/extattr.h>
      #include <ufs/ufs/gjournal.h>
      #include <ufs/ufs/quota.h>
      #include <ufs/ufs/ufsmount.h>
      #include <ufs/ufs/inode.h>
      #include <ufs/ufs/ufs_extern.h>
      
      #include <ufs/ffs/fs.h>
      #include <ufs/ffs/ffs_extern.h>
      
      #include <vm/vm.h>
      #include <vm/uma.h>
      #include <vm/vm_page.h>
      
      #include <geom/geom.h>
      #include <geom/geom_vfs.h>
      
      #include <ddb/ddb.h>
      
      static uma_zone_t uma_inode, uma_ufs1, uma_ufs2;
      
      static int        ffs_mountfs(struct vnode *, struct mount *, struct thread *);
      static void        ffs_oldfscompat_read(struct fs *, struct ufsmount *,
                          ufs2_daddr_t);
      static void        ffs_ifree(struct ufsmount *ump, struct inode *ip);
      static int        ffs_sync_lazy(struct mount *mp);
      static int        ffs_use_bread(void *devfd, off_t loc, void **bufp, int size);
      static int        ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size);
      
      static vfs_init_t ffs_init;
      static vfs_uninit_t ffs_uninit;
      static vfs_extattrctl_t ffs_extattrctl;
      static vfs_cmount_t ffs_cmount;
      static vfs_unmount_t ffs_unmount;
      static vfs_mount_t ffs_mount;
      static vfs_statfs_t ffs_statfs;
      static vfs_fhtovp_t ffs_fhtovp;
      static vfs_sync_t ffs_sync;
      
      static struct vfsops ufs_vfsops = {
              .vfs_extattrctl =        ffs_extattrctl,
              .vfs_fhtovp =                ffs_fhtovp,
              .vfs_init =                ffs_init,
              .vfs_mount =                ffs_mount,
              .vfs_cmount =                ffs_cmount,
              .vfs_quotactl =                ufs_quotactl,
              .vfs_root =                vfs_cache_root,
              .vfs_cachedroot =        ufs_root,
              .vfs_statfs =                ffs_statfs,
              .vfs_sync =                ffs_sync,
              .vfs_uninit =                ffs_uninit,
              .vfs_unmount =                ffs_unmount,
              .vfs_vget =                ffs_vget,
              .vfs_susp_clean =        process_deferred_inactive,
      };
      
      VFS_SET(ufs_vfsops, ufs, 0);
      MODULE_VERSION(ufs, 1);
      
      static b_strategy_t ffs_geom_strategy;
      static b_write_t ffs_bufwrite;
      
      static struct buf_ops ffs_ops = {
              .bop_name =        "FFS",
              .bop_write =        ffs_bufwrite,
              .bop_strategy =        ffs_geom_strategy,
              .bop_sync =        bufsync,
      #ifdef NO_FFS_SNAPSHOT
              .bop_bdflush =        bufbdflush,
      #else
              .bop_bdflush =        ffs_bdflush,
      #endif
      };
      
      /*
       * Note that userquota and groupquota options are not currently used
       * by UFS/FFS code and generally mount(8) does not pass those options
       * from userland, but they can be passed by loader(8) via
       * vfs.root.mountfrom.options.
       */
      static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr",
          "noclusterw", "noexec", "export", "force", "from", "groupquota",
          "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir",
          "nosymfollow", "sync", "union", "userquota", "untrusted", NULL };
      
      static int
      ffs_mount(struct mount *mp)
      {
              struct vnode *devvp, *odevvp;
              struct thread *td;
              struct ufsmount *ump = NULL;
              struct fs *fs;
              pid_t fsckpid = 0;
              int error, error1, flags;
              uint64_t mntorflags, saved_mnt_flag;
              accmode_t accmode;
              struct nameidata ndp;
              char *fspec;
      
              td = curthread;
              if (vfs_filteropt(mp->mnt_optnew, ffs_opts))
                      return (EINVAL);
              if (uma_inode == NULL) {
                      uma_inode = uma_zcreate("FFS inode",
                          sizeof(struct inode), NULL, NULL, NULL, NULL,
                          UMA_ALIGN_PTR, 0);
                      uma_ufs1 = uma_zcreate("FFS1 dinode",
                          sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL,
                          UMA_ALIGN_PTR, 0);
                      uma_ufs2 = uma_zcreate("FFS2 dinode",
                          sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL,
                          UMA_ALIGN_PTR, 0);
              }
      
              vfs_deleteopt(mp->mnt_optnew, "groupquota");
              vfs_deleteopt(mp->mnt_optnew, "userquota");
      
              fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
              if (error)
                      return (error);
      
              mntorflags = 0;
              if (vfs_getopt(mp->mnt_optnew, "untrusted", NULL, NULL) == 0)
                      mntorflags |= MNT_UNTRUSTED;
      
              if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0)
                      mntorflags |= MNT_ACLS;
      
              if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) {
                      mntorflags |= MNT_SNAPSHOT;
                      /*
                       * Once we have set the MNT_SNAPSHOT flag, do not
                       * persist "snapshot" in the options list.
                       */
                      vfs_deleteopt(mp->mnt_optnew, "snapshot");
                      vfs_deleteopt(mp->mnt_opt, "snapshot");
              }
      
              if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 &&
                  vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) {
                      /*
                       * Once we have set the restricted PID, do not
                       * persist "fsckpid" in the options list.
                       */
                      vfs_deleteopt(mp->mnt_optnew, "fsckpid");
                      vfs_deleteopt(mp->mnt_opt, "fsckpid");
                      if (mp->mnt_flag & MNT_UPDATE) {
                              if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 &&
                                   vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) {
                                      vfs_mount_error(mp,
                                          "Checker enable: Must be read-only");
                                      return (EINVAL);
                              }
                      } else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) {
                              vfs_mount_error(mp,
                                  "Checker enable: Must be read-only");
                              return (EINVAL);
                      }
                      /* Set to -1 if we are done */
                      if (fsckpid == 0)
                              fsckpid = -1;
              }
      
              if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) {
                      if (mntorflags & MNT_ACLS) {
                              vfs_mount_error(mp,
                                  "\"acls\" and \"nfsv4acls\" options "
                                  "are mutually exclusive");
                              return (EINVAL);
                      }
                      mntorflags |= MNT_NFS4ACLS;
              }
      
              MNT_ILOCK(mp);
              mp->mnt_flag |= mntorflags;
              MNT_IUNLOCK(mp);
              /*
               * If updating, check whether changing from read-only to
               * read/write; if there is no device name, that's all we do.
               */
              if (mp->mnt_flag & MNT_UPDATE) {
                      ump = VFSTOUFS(mp);
                      fs = ump->um_fs;
                      odevvp = ump->um_odevvp;
                      devvp = ump->um_devvp;
                      if (fsckpid == -1 && ump->um_fsckpid > 0) {
                              if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 ||
                                  (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0)
                                      return (error);
                              g_topology_lock();
                              /*
                               * Return to normal read-only mode.
                               */
                              error = g_access(ump->um_cp, 0, -1, 0);
                              g_topology_unlock();
                              ump->um_fsckpid = 0;
                      }
                      if (fs->fs_ronly == 0 &&
                          vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
                              /*
                               * Flush any dirty data and suspend filesystem.
                               */
                              if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
                                      return (error);
                              error = vfs_write_suspend_umnt(mp);
                              if (error != 0)
                                      return (error);
                              /*
                               * Check for and optionally get rid of files open
                               * for writing.
                               */
                              flags = WRITECLOSE;
                              if (mp->mnt_flag & MNT_FORCE)
                                      flags |= FORCECLOSE;
                              if (MOUNTEDSOFTDEP(mp)) {
                                      error = softdep_flushfiles(mp, flags, td);
                              } else {
                                      error = ffs_flushfiles(mp, flags, td);
                              }
                              if (error) {
                                      vfs_write_resume(mp, 0);
                                      return (error);
                              }
                              if (fs->fs_pendingblocks != 0 ||
                                  fs->fs_pendinginodes != 0) {
                                      printf("WARNING: %s Update error: blocks %jd "
                                          "files %d\n", fs->fs_fsmnt, 
                                          (intmax_t)fs->fs_pendingblocks,
                                          fs->fs_pendinginodes);
                                      fs->fs_pendingblocks = 0;
                                      fs->fs_pendinginodes = 0;
                              }
                              if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
                                      fs->fs_clean = 1;
                              if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
                                      fs->fs_ronly = 0;
                                      fs->fs_clean = 0;
                                      vfs_write_resume(mp, 0);
                                      return (error);
                              }
                              if (MOUNTEDSOFTDEP(mp))
                                      softdep_unmount(mp);
                              g_topology_lock();
                              /*
                               * Drop our write and exclusive access.
                               */
                              g_access(ump->um_cp, 0, -1, -1);
                              g_topology_unlock();
                              fs->fs_ronly = 1;
                              MNT_ILOCK(mp);
                              mp->mnt_flag |= MNT_RDONLY;
                              MNT_IUNLOCK(mp);
                              /*
                               * Allow the writers to note that filesystem
                               * is ro now.
                               */
                              vfs_write_resume(mp, 0);
                      }
                      if ((mp->mnt_flag & MNT_RELOAD) &&
                          (error = ffs_reload(mp, td, 0)) != 0)
                              return (error);
                      if (fs->fs_ronly &&
                          !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
                              /*
                               * If we are running a checker, do not allow upgrade.
                               */
                              if (ump->um_fsckpid > 0) {
                                      vfs_mount_error(mp,
                                          "Active checker, cannot upgrade to write");
                                      return (EINVAL);
                              }
                              /*
                               * If upgrade to read-write by non-root, then verify
                               * that user has necessary permissions on the device.
                               */
                              vn_lock(odevvp, LK_EXCLUSIVE | LK_RETRY);
                              error = VOP_ACCESS(odevvp, VREAD | VWRITE,
                                  td->td_ucred, td);
                              if (error)
                                      error = priv_check(td, PRIV_VFS_MOUNT_PERM);
                              VOP_UNLOCK(odevvp);
                              if (error) {
                                      return (error);
                              }
                              fs->fs_flags &= ~FS_UNCLEAN;
                              if (fs->fs_clean == 0) {
                                      fs->fs_flags |= FS_UNCLEAN;
                                      if ((mp->mnt_flag & MNT_FORCE) ||
                                          ((fs->fs_flags &
                                           (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
                                           (fs->fs_flags & FS_DOSOFTDEP))) {
                                              printf("WARNING: %s was not properly "
                                                 "dismounted\n", fs->fs_fsmnt);
                                      } else {
                                              vfs_mount_error(mp,
                                                 "R/W mount of %s denied. %s.%s",
                                                 fs->fs_fsmnt,
                                                 "Filesystem is not clean - run fsck",
                                                 (fs->fs_flags & FS_SUJ) == 0 ? "" :
                                                 " Forced mount will invalidate"
                                                 " journal contents");
                                              return (EPERM);
                                      }
                              }
                              g_topology_lock();
                              /*
                               * Request exclusive write access.
                               */
                              error = g_access(ump->um_cp, 0, 1, 1);
                              g_topology_unlock();
                              if (error)
                                      return (error);
                              if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
                                      return (error);
                              error = vfs_write_suspend_umnt(mp);
                              if (error != 0)
                                      return (error);
                              fs->fs_ronly = 0;
                              MNT_ILOCK(mp);
                              saved_mnt_flag = MNT_RDONLY;
                              if (MOUNTEDSOFTDEP(mp) && (mp->mnt_flag &
                                  MNT_ASYNC) != 0)
                                      saved_mnt_flag |= MNT_ASYNC;
                              mp->mnt_flag &= ~saved_mnt_flag;
                              MNT_IUNLOCK(mp);
                              fs->fs_mtime = time_second;
                              /* check to see if we need to start softdep */
                              if ((fs->fs_flags & FS_DOSOFTDEP) &&
                                  (error = softdep_mount(devvp, mp, fs, td->td_ucred))){
                                      fs->fs_ronly = 1;
                                      MNT_ILOCK(mp);
                                      mp->mnt_flag |= saved_mnt_flag;
                                      MNT_IUNLOCK(mp);
                                      vfs_write_resume(mp, 0);
                                      return (error);
                              }
                              fs->fs_clean = 0;
                              if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
                                      fs->fs_ronly = 1;
                                      MNT_ILOCK(mp);
                                      mp->mnt_flag |= saved_mnt_flag;
                                      MNT_IUNLOCK(mp);
                                      vfs_write_resume(mp, 0);
                                      return (error);
                              }
                              if (fs->fs_snapinum[0] != 0)
                                      ffs_snapshot_mount(mp);
                              vfs_write_resume(mp, 0);
                      }
                      /*
                       * Soft updates is incompatible with "async",
                       * so if we are doing softupdates stop the user
                       * from setting the async flag in an update.
                       * Softdep_mount() clears it in an initial mount
                       * or ro->rw remount.
                       */
                      if (MOUNTEDSOFTDEP(mp)) {
                              /* XXX: Reset too late ? */
                              MNT_ILOCK(mp);
                              mp->mnt_flag &= ~MNT_ASYNC;
                              MNT_IUNLOCK(mp);
                      }
                      /*
                       * Keep MNT_ACLS flag if it is stored in superblock.
                       */
                      if ((fs->fs_flags & FS_ACLS) != 0) {
                              /* XXX: Set too late ? */
                              MNT_ILOCK(mp);
                              mp->mnt_flag |= MNT_ACLS;
                              MNT_IUNLOCK(mp);
                      }
      
                      if ((fs->fs_flags & FS_NFS4ACLS) != 0) {
                              /* XXX: Set too late ? */
                              MNT_ILOCK(mp);
                              mp->mnt_flag |= MNT_NFS4ACLS;
                              MNT_IUNLOCK(mp);
                      }
                      /*
                       * If this is a request from fsck to clean up the filesystem,
                       * then allow the specified pid to proceed.
                       */
                      if (fsckpid > 0) {
                              if (ump->um_fsckpid != 0) {
                                      vfs_mount_error(mp,
                                          "Active checker already running on %s",
                                          fs->fs_fsmnt);
                                      return (EINVAL);
                              }
                              KASSERT(MOUNTEDSOFTDEP(mp) == 0,
                                  ("soft updates enabled on read-only file system"));
                              g_topology_lock();
                              /*
                               * Request write access.
                               */
                              error = g_access(ump->um_cp, 0, 1, 0);
                              g_topology_unlock();
                              if (error) {
                                      vfs_mount_error(mp,
                                          "Checker activation failed on %s",
                                          fs->fs_fsmnt);
                                      return (error);
                              }
                              ump->um_fsckpid = fsckpid;
                              if (fs->fs_snapinum[0] != 0)
                                      ffs_snapshot_mount(mp);
                              fs->fs_mtime = time_second;
                              fs->fs_fmod = 1;
                              fs->fs_clean = 0;
                              (void) ffs_sbupdate(ump, MNT_WAIT, 0);
                      }
      
                      /*
                       * If this is a snapshot request, take the snapshot.
                       */
                      if (mp->mnt_flag & MNT_SNAPSHOT)
                              return (ffs_snapshot(mp, fspec));
      
                      /*
                       * Must not call namei() while owning busy ref.
                       */
                      vfs_unbusy(mp);
              }
      
              /*
               * Not an update, or updating the name: look up the name
               * and verify that it refers to a sensible disk device.
               */
              NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
              error = namei(&ndp);
              if ((mp->mnt_flag & MNT_UPDATE) != 0) {
                      /*
                       * Unmount does not start if MNT_UPDATE is set.  Mount
                       * update busies mp before setting MNT_UPDATE.  We
 * must be able to retain our busy ref successfully,
                       * without sleep.
                       */
                      error1 = vfs_busy(mp, MBF_NOWAIT);
                      MPASS(error1 == 0);
              }
              if (error != 0)
                      return (error);
              NDFREE(&ndp, NDF_ONLY_PNBUF);
              devvp = ndp.ni_vp;
              if (!vn_isdisk(devvp, &error)) {
                      vput(devvp);
                      return (error);
              }
      
              /*
               * If mount by non-root, then verify that user has necessary
               * permissions on the device.
               */
              accmode = VREAD;
              if ((mp->mnt_flag & MNT_RDONLY) == 0)
                      accmode |= VWRITE;
              error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
              if (error)
                      error = priv_check(td, PRIV_VFS_MOUNT_PERM);
              if (error) {
                      vput(devvp);
                      return (error);
              }
      
              if (mp->mnt_flag & MNT_UPDATE) {
                      /*
                       * Update only
                       *
                       * If it's not the same vnode, or at least the same device
                       * then it's not correct.
                       */
      
                      if (devvp->v_rdev != ump->um_devvp->v_rdev)
                              error = EINVAL;        /* needs translation */
                      vput(devvp);
                      if (error)
                              return (error);
              } else {
                      /*
                       * New mount
                       *
                       * We need the name for the mount point (also used for
                       * "last mounted on") copied in. If an error occurs,
                       * the mount point is discarded by the upper level code.
                       * Note that vfs_mount_alloc() populates f_mntonname for us.
                       */
                      if ((error = ffs_mountfs(devvp, mp, td)) != 0) {
                              vrele(devvp);
                              return (error);
                      }
                      if (fsckpid > 0) {
                              KASSERT(MOUNTEDSOFTDEP(mp) == 0,
                                  ("soft updates enabled on read-only file system"));
                              ump = VFSTOUFS(mp);
                              fs = ump->um_fs;
                              g_topology_lock();
                              /*
                               * Request write access.
                               */
                              error = g_access(ump->um_cp, 0, 1, 0);
                              g_topology_unlock();
                              if (error) {
                                      printf("WARNING: %s: Checker activation "
                                          "failed\n", fs->fs_fsmnt);
                              } else { 
                                      ump->um_fsckpid = fsckpid;
                                      if (fs->fs_snapinum[0] != 0)
                                              ffs_snapshot_mount(mp);
                                      fs->fs_mtime = time_second;
                                      fs->fs_clean = 0;
                                      (void) ffs_sbupdate(ump, MNT_WAIT, 0);
                              }
                      }
              }
              vfs_mountedfrom(mp, fspec);
              return (0);
      }
      
      /*
       * Compatibility with old mount system call.
       */
      
      static int
      ffs_cmount(struct mntarg *ma, void *data, uint64_t flags)
      {
              struct ufs_args args;
              struct export_args exp;
              int error;
      
              if (data == NULL)
                      return (EINVAL);
              error = copyin(data, &args, sizeof args);
              if (error)
                      return (error);
              vfs_oexport_conv(&args.export, &exp);
      
              ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
              ma = mount_arg(ma, "export", &exp, sizeof(exp));
              error = kernel_mount(ma, flags);
      
              return (error);
      }
      
      /*
       * Reload all incore data for a filesystem (used after running fsck on
       * the root filesystem and finding things to fix). If the 'force' flag
       * is 0, the filesystem must be mounted read-only.
       *
       * Things to do to update the mount:
       *        1) invalidate all cached meta-data.
       *        2) re-read superblock from disk.
       *        3) re-read summary information from disk.
       *        4) invalidate all inactive vnodes.
       *        5) clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags, allowing secondary
       *           writers, if requested.
       *        6) invalidate all cached file data.
       *        7) re-read inode data for all active vnodes.
       */
      int
      ffs_reload(struct mount *mp, struct thread *td, int flags)
      {
              struct vnode *vp, *mvp, *devvp;
              struct inode *ip;
              void *space;
              struct buf *bp;
              struct fs *fs, *newfs;
              struct ufsmount *ump;
              ufs2_daddr_t sblockloc;
              int i, blks, error;
              u_long size;
              int32_t *lp;
      
              ump = VFSTOUFS(mp);
      
              MNT_ILOCK(mp);
              if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) {
                      MNT_IUNLOCK(mp);
                      return (EINVAL);
              }
              MNT_IUNLOCK(mp);
              
              /*
               * Step 1: invalidate all cached meta-data.
               */
              devvp = VFSTOUFS(mp)->um_devvp;
              vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
              if (vinvalbuf(devvp, 0, 0, 0) != 0)
                      panic("ffs_reload: dirty1");
              VOP_UNLOCK(devvp);
      
              /*
               * Step 2: re-read superblock from disk.
               */
              fs = VFSTOUFS(mp)->um_fs;
              if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize,
                  NOCRED, &bp)) != 0)
                      return (error);
              newfs = (struct fs *)bp->b_data;
              if ((newfs->fs_magic != FS_UFS1_MAGIC &&
                   newfs->fs_magic != FS_UFS2_MAGIC) ||
                  newfs->fs_bsize > MAXBSIZE ||
                  newfs->fs_bsize < sizeof(struct fs)) {
                              brelse(bp);
                              return (EIO);                /* XXX needs translation */
              }
              /*
               * Copy pointer fields back into superblock before copying in        XXX
               * new superblock. These should really be in the ufsmount.        XXX
	 * Note that important parameters (e.g., fs_ncg) are unchanged.
               */
              newfs->fs_csp = fs->fs_csp;
              newfs->fs_maxcluster = fs->fs_maxcluster;
              newfs->fs_contigdirs = fs->fs_contigdirs;
              newfs->fs_active = fs->fs_active;
              newfs->fs_ronly = fs->fs_ronly;
              sblockloc = fs->fs_sblockloc;
              bcopy(newfs, fs, (u_int)fs->fs_sbsize);
              brelse(bp);
              mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
              ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc);
              UFS_LOCK(ump);
              if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
                      printf("WARNING: %s: reload pending error: blocks %jd "
                          "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
                          fs->fs_pendinginodes);
                      fs->fs_pendingblocks = 0;
                      fs->fs_pendinginodes = 0;
              }
              UFS_UNLOCK(ump);
      
              /*
               * Step 3: re-read summary information from disk.
               */
              size = fs->fs_cssize;
              blks = howmany(size, fs->fs_fsize);
              if (fs->fs_contigsumsize > 0)
                      size += fs->fs_ncg * sizeof(int32_t);
              size += fs->fs_ncg * sizeof(u_int8_t);
              free(fs->fs_csp, M_UFSMNT);
              space = malloc(size, M_UFSMNT, M_WAITOK);
              fs->fs_csp = space;
              for (i = 0; i < blks; i += fs->fs_frag) {
                      size = fs->fs_bsize;
                      if (i + fs->fs_frag > blks)
                              size = (blks - i) * fs->fs_fsize;
                      error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
                          NOCRED, &bp);
                      if (error)
                              return (error);
                      bcopy(bp->b_data, space, (u_int)size);
                      space = (char *)space + size;
                      brelse(bp);
              }
              /*
               * We no longer know anything about clusters per cylinder group.
               */
              if (fs->fs_contigsumsize > 0) {
                      fs->fs_maxcluster = lp = space;
                      for (i = 0; i < fs->fs_ncg; i++)
                              *lp++ = fs->fs_contigsumsize;
                      space = lp;
              }
              size = fs->fs_ncg * sizeof(u_int8_t);
              fs->fs_contigdirs = (u_int8_t *)space;
              bzero(fs->fs_contigdirs, size);
              if ((flags & FFSR_UNSUSPEND) != 0) {
                      MNT_ILOCK(mp);
                      mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
                      wakeup(&mp->mnt_flag);
                      MNT_IUNLOCK(mp);
              }
      
      loop:
              MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
                      /*
                       * Skip syncer vnode.
                       */
                      if (vp->v_type == VNON) {
                              VI_UNLOCK(vp);
                              continue;
                      }
                      /*
		 * Step 6: invalidate all cached file data.
                       */
                      if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
                              MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
                              goto loop;
                      }
                      if (vinvalbuf(vp, 0, 0, 0))
                              panic("ffs_reload: dirty2");
                      /*
		 * Step 7: re-read inode data for all active vnodes.
                       */
                      ip = VTOI(vp);
                      error =
                          bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
                          (int)fs->fs_bsize, NOCRED, &bp);
                      if (error) {
                              vput(vp);
                              MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
                              return (error);
                      }
                      if ((error = ffs_load_inode(bp, ip, fs, ip->i_number)) != 0) {
                              brelse(bp);
                              vput(vp);
                              MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
                              return (error);
                      }
                      ip->i_effnlink = ip->i_nlink;
                      brelse(bp);
                      vput(vp);
              }
              return (0);
      }
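
/*
 * Illustrative sketch (an assumption, not taken from this file): after
 * fsck has repaired the root filesystem, a caller holding the mount could
 * refresh the in-core state roughly as follows; error handling is minimal
 * for brevity.
 *
 *	error = ffs_reload(mp, curthread, 0);
 *	if (error != 0)
 *		printf("WARNING: reload of %s failed: %d\n",
 *		    mp->mnt_stat.f_mntonname, error);
 */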
      
      /*
       * Common code for mount and mountroot
       */
      static int
      ffs_mountfs(odevvp, mp, td)
              struct vnode *odevvp;
              struct mount *mp;
              struct thread *td;
      {
              struct ufsmount *ump;
              struct fs *fs;
              struct cdev *dev;
              int error, i, len, ronly;
              struct ucred *cred;
              struct g_consumer *cp;
              struct mount *nmp;
              struct vnode *devvp;
              int candelete, canspeedup;
              off_t loc;
      
              fs = NULL;
              ump = NULL;
              cred = td ? td->td_ucred : NOCRED;
              ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
      
              devvp = mntfs_allocvp(mp, odevvp);
              VOP_UNLOCK(odevvp);
              KASSERT(devvp->v_type == VCHR, ("reclaimed devvp"));
              dev = devvp->v_rdev;
              if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0,
                  (uintptr_t)mp) == 0) {
                      mntfs_freevp(devvp);
                      return (EBUSY);
              }
              g_topology_lock();
              error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
              g_topology_unlock();
              if (error != 0) {
                      atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
                      mntfs_freevp(devvp);
                      return (error);
              }
              dev_ref(dev);
              devvp->v_bufobj.bo_ops = &ffs_ops;
              BO_LOCK(&odevvp->v_bufobj);
              odevvp->v_bufobj.bo_flag |= BO_NOBUFS;
              BO_UNLOCK(&odevvp->v_bufobj);
              if (dev->si_iosize_max != 0)
                      mp->mnt_iosize_max = dev->si_iosize_max;
              if (mp->mnt_iosize_max > MAXPHYS)
                      mp->mnt_iosize_max = MAXPHYS;
              if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) {
                      error = EINVAL;
                      vfs_mount_error(mp,
                          "Invalid sectorsize %d for superblock size %d",
                          cp->provider->sectorsize, SBLOCKSIZE);
                      goto out;
              }
              /* fetch the superblock and summary information */
              loc = STDSB;
              if ((mp->mnt_flag & MNT_ROOTFS) != 0)
                      loc = STDSB_NOHASHFAIL;
              if ((error = ffs_sbget(devvp, &fs, loc, M_UFSMNT, ffs_use_bread)) != 0)
                      goto out;
              /* none of these types of check-hashes are maintained by this kernel */
              fs->fs_metackhash &= ~(CK_INDIR | CK_DIR);
              /* no support for any undefined flags */
              fs->fs_flags &= FS_SUPPORTED;
              fs->fs_flags &= ~FS_UNCLEAN;
              if (fs->fs_clean == 0) {
                      fs->fs_flags |= FS_UNCLEAN;
                      if (ronly || (mp->mnt_flag & MNT_FORCE) ||
                          ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
                           (fs->fs_flags & FS_DOSOFTDEP))) {
                              printf("WARNING: %s was not properly dismounted\n",
                                  fs->fs_fsmnt);
                      } else {
                              vfs_mount_error(mp, "R/W mount of %s denied. %s%s",
                                  fs->fs_fsmnt, "Filesystem is not clean - run fsck.",
                                  (fs->fs_flags & FS_SUJ) == 0 ? "" :
                                  " Forced mount will invalidate journal contents");
                              error = EPERM;
                              goto out;
                      }
                      if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) &&
                          (mp->mnt_flag & MNT_FORCE)) {
                              printf("WARNING: %s: lost blocks %jd files %d\n",
                                  fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
                                  fs->fs_pendinginodes);
                              fs->fs_pendingblocks = 0;
                              fs->fs_pendinginodes = 0;
                      }
              }
              if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
                      printf("WARNING: %s: mount pending error: blocks %jd "
                          "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
                          fs->fs_pendinginodes);
                      fs->fs_pendingblocks = 0;
                      fs->fs_pendinginodes = 0;
              }
              if ((fs->fs_flags & FS_GJOURNAL) != 0) {
      #ifdef UFS_GJOURNAL
                      /*
                       * Get journal provider name.
                       */
                      len = 1024;
                      mp->mnt_gjprovider = malloc((u_long)len, M_UFSMNT, M_WAITOK);
                      if (g_io_getattr("GJOURNAL::provider", cp, &len,
                          mp->mnt_gjprovider) == 0) {
                              mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len,
                                  M_UFSMNT, M_WAITOK);
                              MNT_ILOCK(mp);
                              mp->mnt_flag |= MNT_GJOURNAL;
                              MNT_IUNLOCK(mp);
                      } else {
                              printf("WARNING: %s: GJOURNAL flag on fs "
                                  "but no gjournal provider below\n",
                                  mp->mnt_stat.f_mntonname);
                              free(mp->mnt_gjprovider, M_UFSMNT);
                              mp->mnt_gjprovider = NULL;
                      }
      #else
                      printf("WARNING: %s: GJOURNAL flag on fs but no "
                          "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname);
      #endif
              } else {
                      mp->mnt_gjprovider = NULL;
              }
              ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
              ump->um_cp = cp;
              ump->um_bo = &devvp->v_bufobj;
              ump->um_fs = fs;
              if (fs->fs_magic == FS_UFS1_MAGIC) {
                      ump->um_fstype = UFS1;
                      ump->um_balloc = ffs_balloc_ufs1;
              } else {
                      ump->um_fstype = UFS2;
                      ump->um_balloc = ffs_balloc_ufs2;
              }
              ump->um_blkatoff = ffs_blkatoff;
              ump->um_truncate = ffs_truncate;
              ump->um_update = ffs_update;
              ump->um_valloc = ffs_valloc;
              ump->um_vfree = ffs_vfree;
              ump->um_ifree = ffs_ifree;
              ump->um_rdonly = ffs_rdonly;
              ump->um_snapgone = ffs_snapgone;
              if ((mp->mnt_flag & MNT_UNTRUSTED) != 0)
                      ump->um_check_blkno = ffs_check_blkno;
              else
                      ump->um_check_blkno = NULL;
              mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF);
              ffs_oldfscompat_read(fs, ump, fs->fs_sblockloc);
              fs->fs_ronly = ronly;
              fs->fs_active = NULL;
              mp->mnt_data = ump;
              mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0];
              mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1];
              nmp = NULL;
              if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 ||
                  (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) {
                      if (nmp)
                              vfs_rel(nmp);
                      vfs_getnewfsid(mp);
              }
              mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
              MNT_ILOCK(mp);
              mp->mnt_flag |= MNT_LOCAL;
              MNT_IUNLOCK(mp);
              if ((fs->fs_flags & FS_MULTILABEL) != 0) {
      #ifdef MAC
                      MNT_ILOCK(mp);
                      mp->mnt_flag |= MNT_MULTILABEL;
                      MNT_IUNLOCK(mp);
      #else
                      printf("WARNING: %s: multilabel flag on fs but "
                          "no MAC support\n", mp->mnt_stat.f_mntonname);
      #endif
              }
              if ((fs->fs_flags & FS_ACLS) != 0) {
      #ifdef UFS_ACL
                      MNT_ILOCK(mp);
      
                      if (mp->mnt_flag & MNT_NFS4ACLS)
                              printf("WARNING: %s: ACLs flag on fs conflicts with "
                                  "\"nfsv4acls\" mount option; option ignored\n",
                                  mp->mnt_stat.f_mntonname);
                      mp->mnt_flag &= ~MNT_NFS4ACLS;
                      mp->mnt_flag |= MNT_ACLS;
      
                      MNT_IUNLOCK(mp);
      #else
                      printf("WARNING: %s: ACLs flag on fs but no ACLs support\n",
                          mp->mnt_stat.f_mntonname);
      #endif
              }
              if ((fs->fs_flags & FS_NFS4ACLS) != 0) {
      #ifdef UFS_ACL
                      MNT_ILOCK(mp);
      
                      if (mp->mnt_flag & MNT_ACLS)
                              printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts "
                                  "with \"acls\" mount option; option ignored\n",
                                  mp->mnt_stat.f_mntonname);
                      mp->mnt_flag &= ~MNT_ACLS;
                      mp->mnt_flag |= MNT_NFS4ACLS;
      
                      MNT_IUNLOCK(mp);
      #else
                      printf("WARNING: %s: NFSv4 ACLs flag on fs but no "
                          "ACLs support\n", mp->mnt_stat.f_mntonname);
      #endif
              }
              if ((fs->fs_flags & FS_TRIM) != 0) {
                      len = sizeof(int);
                      if (g_io_getattr("GEOM::candelete", cp, &len,
                          &candelete) == 0) {
                              if (candelete)
                                      ump->um_flags |= UM_CANDELETE;
                              else
                                      printf("WARNING: %s: TRIM flag on fs but disk "
                                          "does not support TRIM\n",
                                          mp->mnt_stat.f_mntonname);
                      } else {
                              printf("WARNING: %s: TRIM flag on fs but disk does "
                                  "not confirm that it supports TRIM\n",
                                  mp->mnt_stat.f_mntonname);
                      }
                      if (((ump->um_flags) & UM_CANDELETE) != 0) {
                              ump->um_trim_tq = taskqueue_create("trim", M_WAITOK,
                                  taskqueue_thread_enqueue, &ump->um_trim_tq);
                              taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS,
                                  "%s trim", mp->mnt_stat.f_mntonname);
                              ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM,
                                  &ump->um_trimlisthashsize);
                      }
              }
      
              len = sizeof(int);
              if (g_io_getattr("GEOM::canspeedup", cp, &len, &canspeedup) == 0) {
                      if (canspeedup)
                              ump->um_flags |= UM_CANSPEEDUP;
              }
      
              ump->um_mountp = mp;
              ump->um_dev = dev;
              ump->um_devvp = devvp;
              ump->um_odevvp = odevvp;
              ump->um_nindir = fs->fs_nindir;
              ump->um_bptrtodb = fs->fs_fsbtodb;
              ump->um_seqinc = fs->fs_frag;
              for (i = 0; i < MAXQUOTAS; i++)
                      ump->um_quotas[i] = NULLVP;
      #ifdef UFS_EXTATTR
              ufs_extattr_uepm_init(&ump->um_extattr);
      #endif
              /*
               * Set FS local "last mounted on" information (NULL pad)
               */
              bzero(fs->fs_fsmnt, MAXMNTLEN);
              strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);
              mp->mnt_stat.f_iosize = fs->fs_bsize;
      
              if (mp->mnt_flag & MNT_ROOTFS) {
                      /*
                       * Root mount; update timestamp in mount structure.
		 * This will be used by the common root mount code
                       * to update the system clock.
                       */
                      mp->mnt_time = fs->fs_time;
              }
      
              if (ronly == 0) {
                      fs->fs_mtime = time_second;
                      if ((fs->fs_flags & FS_DOSOFTDEP) &&
                          (error = softdep_mount(devvp, mp, fs, cred)) != 0) {
                              ffs_flushfiles(mp, FORCECLOSE, td);
                              goto out;
                      }
                      if (fs->fs_snapinum[0] != 0)
                              ffs_snapshot_mount(mp);
                      fs->fs_fmod = 1;
                      fs->fs_clean = 0;
                      (void) ffs_sbupdate(ump, MNT_WAIT, 0);
              }
              /*
               * Initialize filesystem state information in mount struct.
               */
              MNT_ILOCK(mp);
              mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
                  MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE;
              MNT_IUNLOCK(mp);
      #ifdef UFS_EXTATTR
      #ifdef UFS_EXTATTR_AUTOSTART
	/*
	 * Auto-starting does the following:
               *        - check for /.attribute in the fs, and extattr_start if so
               *        - for each file in .attribute, enable that file with
               *           an attribute of the same name.
               * Not clear how to report errors -- probably eat them.
               * This would all happen while the filesystem was busy/not
               * available, so would effectively be "atomic".
               */
              (void) ufs_extattr_autostart(mp, td);
      #endif /* !UFS_EXTATTR_AUTOSTART */
      #endif /* !UFS_EXTATTR */
              return (0);
      out:
              if (fs != NULL) {
                      free(fs->fs_csp, M_UFSMNT);
                      free(fs, M_UFSMNT);
              }
              if (cp != NULL) {
                      g_topology_lock();
                      g_vfs_close(cp);
                      g_topology_unlock();
              }
              if (ump) {
                      mtx_destroy(UFS_MTX(ump));
                      if (mp->mnt_gjprovider != NULL) {
                              free(mp->mnt_gjprovider, M_UFSMNT);
                              mp->mnt_gjprovider = NULL;
                      }
                      free(ump, M_UFSMNT);
                      mp->mnt_data = NULL;
              }
              BO_LOCK(&odevvp->v_bufobj);
              odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS;
              BO_UNLOCK(&odevvp->v_bufobj);
              atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
              mntfs_freevp(devvp);
              dev_rel(dev);
              return (error);
      }
      
      /*
       * A read function for use by filesystem-layer routines.
       */
      static int
      ffs_use_bread(void *devfd, off_t loc, void **bufp, int size)
      {
              struct buf *bp;
              int error;
      
              KASSERT(*bufp == NULL, ("ffs_use_bread: non-NULL *bufp %p\n", *bufp));
              *bufp = malloc(size, M_UFSMNT, M_WAITOK);
              if ((error = bread((struct vnode *)devfd, btodb(loc), size, NOCRED,
                  &bp)) != 0)
                      return (error);
              bcopy(bp->b_data, *bufp, size);
              bp->b_flags |= B_INVAL | B_NOCACHE;
              brelse(bp);
              return (0);
      }
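
/*
 * Example of use (from ffs_mountfs() above): the superblock code is handed
 * this routine together with the device vnode so that ffs_sbget() can read
 * the superblock and its summary information through the buffer cache:
 *
 *	error = ffs_sbget(devvp, &fs, loc, M_UFSMNT, ffs_use_bread);
 */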
      
      #include <sys/sysctl.h>
      static int bigcgs = 0;
SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0,
    "Debug: force the in-core fs_cgsize up to fs_bsize");
      
      /*
       * Sanity checks for loading old filesystem superblocks.
       * See ffs_oldfscompat_write below for unwound actions.
       *
       * XXX - Parts get retired eventually.
       * Unfortunately new bits get added.
       */
      static void
      ffs_oldfscompat_read(fs, ump, sblockloc)
              struct fs *fs;
              struct ufsmount *ump;
              ufs2_daddr_t sblockloc;
      {
              off_t maxfilesize;
      
              /*
               * If not yet done, update fs_flags location and value of fs_sblockloc.
               */
              if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
                      fs->fs_flags = fs->fs_old_flags;
                      fs->fs_old_flags |= FS_FLAGS_UPDATED;
                      fs->fs_sblockloc = sblockloc;
              }
              /*
               * If not yet done, update UFS1 superblock with new wider fields.
               */
              if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) {
                      fs->fs_maxbsize = fs->fs_bsize;
                      fs->fs_time = fs->fs_old_time;
                      fs->fs_size = fs->fs_old_size;
                      fs->fs_dsize = fs->fs_old_dsize;
                      fs->fs_csaddr = fs->fs_old_csaddr;
                      fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir;
                      fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree;
                      fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree;
                      fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree;
              }
              if (fs->fs_magic == FS_UFS1_MAGIC &&
                  fs->fs_old_inodefmt < FS_44INODEFMT) {
                      fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1;
                      fs->fs_qbmask = ~fs->fs_bmask;
                      fs->fs_qfmask = ~fs->fs_fmask;
              }
              if (fs->fs_magic == FS_UFS1_MAGIC) {
                      ump->um_savedmaxfilesize = fs->fs_maxfilesize;
                      maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1;
                      if (fs->fs_maxfilesize > maxfilesize)
                              fs->fs_maxfilesize = maxfilesize;
              }
              /* Compatibility for old filesystems */
              if (fs->fs_avgfilesize <= 0)
                      fs->fs_avgfilesize = AVFILESIZ;
              if (fs->fs_avgfpdir <= 0)
                      fs->fs_avgfpdir = AFPDIR;
              if (bigcgs) {
                      fs->fs_save_cgsize = fs->fs_cgsize;
                      fs->fs_cgsize = fs->fs_bsize;
              }
      }
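
/*
 * Worked example for the UFS1 clamp above: with fs_bsize = 16384 the limit
 * is (uint64_t)0x80000000 * 16384 - 1 = 2^45 - 1 bytes (just under 32 TiB),
 * so any larger on-disk fs_maxfilesize is trimmed to that value while the
 * original is preserved in um_savedmaxfilesize for ffs_oldfscompat_write().
 */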
      
      /*
       * Unwinding superblock updates for old filesystems.
       * See ffs_oldfscompat_read above for details.
       *
       * XXX - Parts get retired eventually.
       * Unfortunately new bits get added.
       */
      void
      ffs_oldfscompat_write(fs, ump)
              struct fs *fs;
              struct ufsmount *ump;
      {
      
              /*
               * Copy back UFS2 updated fields that UFS1 inspects.
               */
	if (fs->fs_magic == FS_UFS1_MAGIC) {
                      fs->fs_old_time = fs->fs_time;
                      fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir;
                      fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree;
                      fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree;
                      fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree;
                      fs->fs_maxfilesize = ump->um_savedmaxfilesize;
              }
	if (bigcgs) {
                      fs->fs_cgsize = fs->fs_save_cgsize;
                      fs->fs_save_cgsize = 0;
              }
      }
      
      /*
       * unmount system call
       */
      static int
      ffs_unmount(mp, mntflags)
              struct mount *mp;
              int mntflags;
      {
              struct thread *td;
              struct ufsmount *ump = VFSTOUFS(mp);
              struct fs *fs;
              int error, flags, susp;
      #ifdef UFS_EXTATTR
              int e_restart;
      #endif
      
              flags = 0;
              td = curthread;
              fs = ump->um_fs;
              susp = 0;
              if (mntflags & MNT_FORCE) {
                      flags |= FORCECLOSE;
                      susp = fs->fs_ronly == 0;
              }
      #ifdef UFS_EXTATTR
              if ((error = ufs_extattr_stop(mp, td))) {
                      if (error != EOPNOTSUPP)
                              printf("WARNING: unmount %s: ufs_extattr_stop "
                                  "returned errno %d\n", mp->mnt_stat.f_mntonname,
                                  error);
                      e_restart = 0;
              } else {
                      ufs_extattr_uepm_destroy(&ump->um_extattr);
                      e_restart = 1;
              }
      #endif
              if (susp) {
                      error = vfs_write_suspend_umnt(mp);
                      if (error != 0)
                              goto fail1;
              }
              if (MOUNTEDSOFTDEP(mp))
                      error = softdep_flushfiles(mp, flags, td);
              else
                      error = ffs_flushfiles(mp, flags, td);
              if (error != 0 && error != ENXIO)
                      goto fail;
      
              UFS_LOCK(ump);
              if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
                      printf("WARNING: unmount %s: pending error: blocks %jd "
                          "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
                          fs->fs_pendinginodes);
                      fs->fs_pendingblocks = 0;
                      fs->fs_pendinginodes = 0;
              }
              UFS_UNLOCK(ump);
              if (MOUNTEDSOFTDEP(mp))
                      softdep_unmount(mp);
              if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) {
                      fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
                      error = ffs_sbupdate(ump, MNT_WAIT, 0);
                      if (error && error != ENXIO) {
                              fs->fs_clean = 0;
                              goto fail;
                      }
              }
              if (susp)
                      vfs_write_resume(mp, VR_START_WRITE);
              if (ump->um_trim_tq != NULL) {
                      while (ump->um_trim_inflight != 0)
                              pause("ufsutr", hz);
                      taskqueue_drain_all(ump->um_trim_tq);
                      taskqueue_free(ump->um_trim_tq);
                      free (ump->um_trimhash, M_TRIM);
              }
              g_topology_lock();
              if (ump->um_fsckpid > 0) {
                      /*
                       * Return to normal read-only mode.
                       */
                      error = g_access(ump->um_cp, 0, -1, 0);
                      ump->um_fsckpid = 0;
              }
              g_vfs_close(ump->um_cp);
              g_topology_unlock();
              BO_LOCK(&ump->um_odevvp->v_bufobj);
              ump->um_odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS;
              BO_UNLOCK(&ump->um_odevvp->v_bufobj);
              atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0);
              mntfs_freevp(ump->um_devvp);
              vrele(ump->um_odevvp);
              dev_rel(ump->um_dev);
              mtx_destroy(UFS_MTX(ump));
              if (mp->mnt_gjprovider != NULL) {
                      free(mp->mnt_gjprovider, M_UFSMNT);
                      mp->mnt_gjprovider = NULL;
              }
              free(fs->fs_csp, M_UFSMNT);
              free(fs, M_UFSMNT);
              free(ump, M_UFSMNT);
              mp->mnt_data = NULL;
              MNT_ILOCK(mp);
              mp->mnt_flag &= ~MNT_LOCAL;
              MNT_IUNLOCK(mp);
              if (td->td_su == mp) {
                      td->td_su = NULL;
                      vfs_rel(mp);
              }
              return (error);
      
      fail:
              if (susp)
                      vfs_write_resume(mp, VR_START_WRITE);
      fail1:
      #ifdef UFS_EXTATTR
              if (e_restart) {
                      ufs_extattr_uepm_init(&ump->um_extattr);
      #ifdef UFS_EXTATTR_AUTOSTART
                      (void) ufs_extattr_autostart(mp, td);
      #endif
              }
      #endif
      
              return (error);
      }
      
      /*
       * Flush out all the files in a filesystem.
       */
      int
      ffs_flushfiles(mp, flags, td)
              struct mount *mp;
              int flags;
              struct thread *td;
      {
              struct ufsmount *ump;
              int qerror, error;
      
              ump = VFSTOUFS(mp);
              qerror = 0;
      #ifdef QUOTA
              if (mp->mnt_flag & MNT_QUOTA) {
                      int i;
                      error = vflush(mp, 0, SKIPSYSTEM|flags, td);
                      if (error)
                              return (error);
                      for (i = 0; i < MAXQUOTAS; i++) {
                              error = quotaoff(td, mp, i);
                              if (error != 0) {
                                      if ((flags & EARLYFLUSH) == 0)
                                              return (error);
                                      else
                                              qerror = error;
                              }
                      }
      
                      /*
                       * Here we fall through to vflush again to ensure that
                       * we have gotten rid of all the system vnodes, unless
                       * quotas must not be closed.
                       */
              }
      #endif
              ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles");
              if (ump->um_devvp->v_vflag & VV_COPYONWRITE) {
                      if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0)
                              return (error);
                      ffs_snapshot_unmount(mp);
                      flags |= FORCECLOSE;
                      /*
                       * Here we fall through to vflush again to ensure
                       * that we have gotten rid of all the system vnodes.
                       */
              }
      
              /*
               * Do not close system files if quotas were not closed, to be
               * able to sync the remaining dquots.  The freeblks softupdate
               * workitems might hold a reference on a dquot, preventing
               * quotaoff() from completing.  Next round of
               * softdep_flushworklist() iteration should process the
               * blockers, allowing the next run of quotaoff() to finally
               * flush held dquots.
               *
               * Otherwise, flush all the files.
               */
              if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0)
                      return (error);
      
              /*
               * Flush filesystem metadata.
               */
              vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
              error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td);
              VOP_UNLOCK(ump->um_devvp);
              return (error);
      }
      
      /*
       * Get filesystem statistics.
       */
      static int
      ffs_statfs(mp, sbp)
              struct mount *mp;
              struct statfs *sbp;
      {
              struct ufsmount *ump;
              struct fs *fs;
      
              ump = VFSTOUFS(mp);
              fs = ump->um_fs;
              if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC)
                      panic("ffs_statfs");
              sbp->f_version = STATFS_VERSION;
              sbp->f_bsize = fs->fs_fsize;
              sbp->f_iosize = fs->fs_bsize;
              sbp->f_blocks = fs->fs_dsize;
              UFS_LOCK(ump);
              sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag +
                  fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks);
              sbp->f_bavail = freespace(fs, fs->fs_minfree) +
                  dbtofsb(fs, fs->fs_pendingblocks);
              sbp->f_files =  fs->fs_ncg * fs->fs_ipg - UFS_ROOTINO;
              sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes;
              UFS_UNLOCK(ump);
              sbp->f_namemax = UFS_MAXNAMLEN;
              return (0);
      }
      
      static bool
      sync_doupdate(struct inode *ip)
      {
      
              return ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED |
                  IN_UPDATE)) != 0);
      }
      
      static int
      ffs_sync_lazy_filter(struct vnode *vp, void *arg __unused)
      {
              struct inode *ip;
      
              /*
               * Flags are safe to access because ->v_data invalidation
               * is held off by listmtx.
               */
              if (vp->v_type == VNON)
                      return (false);
              ip = VTOI(vp);
              if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0)
                      return (false);
              return (true);
      }
      
      /*
       * For a lazy sync, we only care about access times, quotas and the
       * superblock.  Other filesystem changes are already converted to
       * cylinder group blocks or inode blocks updates and are written to
       * disk by syncer.
       */
      static int
      ffs_sync_lazy(mp)
           struct mount *mp;
      {
              struct vnode *mvp, *vp;
              struct inode *ip;
              struct thread *td;
              int allerror, error;
      
              allerror = 0;
              td = curthread;
              if ((mp->mnt_flag & MNT_NOATIME) != 0) {
      #ifdef QUOTA
                      qsync(mp);
      #endif
                      goto sbupdate;
              }
              MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, ffs_sync_lazy_filter, NULL) {
                      if (vp->v_type == VNON) {
                              VI_UNLOCK(vp);
                              continue;
                      }
                      ip = VTOI(vp);
      
                      /*
                       * The IN_ACCESS flag is converted to IN_MODIFIED by
                       * ufs_close() and ufs_getattr() by the calls to
                       * ufs_itimes_locked(), without subsequent UFS_UPDATE().
		 * Also test all the other timestamp flags, to pick up any
		 * other cases that could be missed.
                       */
                      if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) {
                              VI_UNLOCK(vp);
                              continue;
                      }
                      if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
                          td)) != 0)
                              continue;
      #ifdef QUOTA
                      qsyncvp(vp);
      #endif
                      if (sync_doupdate(ip))
                              error = ffs_update(vp, 0);
                      if (error != 0)
                              allerror = error;
                      vput(vp);
              }
      sbupdate:
              if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 &&
                  (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0)
                      allerror = error;
              return (allerror);
      }
      
      /*
       * Go through the disk queues to initiate sandbagged IO;
       * go through the inodes to write those that have been modified;
       * initiate the writing of the super block if it has been modified.
       *
       * Note: we are always called with the filesystem marked busy using
       * vfs_busy().
       */
      static int
      ffs_sync(mp, waitfor)
              struct mount *mp;
              int waitfor;
{
              struct vnode *mvp, *vp, *devvp;
              struct thread *td;
              struct inode *ip;
              struct ufsmount *ump = VFSTOUFS(mp);
              struct fs *fs;
              int error, count, lockreq, allerror = 0;
              int suspend;
              int suspended;
              int secondary_writes;
              int secondary_accwrites;
              int softdep_deps;
              int softdep_accdeps;
              struct bufobj *bo;
      
              suspend = 0;
              suspended = 0;
              td = curthread;
              fs = ump->um_fs;
	if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0)
                      panic("%s: ffs_sync: modification on read-only filesystem",
                          fs->fs_fsmnt);
	if (waitfor == MNT_LAZY) {
                      if (!rebooting)
                              return (ffs_sync_lazy(mp));
                      waitfor = MNT_NOWAIT;
              }
      
              /*
               * Write back each (modified) inode.
               */
              lockreq = LK_EXCLUSIVE | LK_NOWAIT;
              if (waitfor == MNT_SUSPEND) {
                      suspend = 1;
                      waitfor = MNT_WAIT;
              }
              if (waitfor == MNT_WAIT)
                      lockreq = LK_EXCLUSIVE;
              lockreq |= LK_INTERLOCK | LK_SLEEPFAIL;
      loop:
              /* Grab snapshot of secondary write counts */
              MNT_ILOCK(mp);
              secondary_writes = mp->mnt_secondary_writes;
              secondary_accwrites = mp->mnt_secondary_accwrites;
              MNT_IUNLOCK(mp);
      
              /* Grab snapshot of softdep dependency counts */
              softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps);
      
	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
                      /*
                       * Depend on the vnode interlock to keep things stable enough
                       * for a quick test.  Since there might be hundreds of
                       * thousands of vnodes, we cannot afford even a subroutine
                       * call unless there's a good chance that we have work to do.
                       */
                      if (vp->v_type == VNON) {
			VI_UNLOCK(vp);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flag &
		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
		    vp->v_bufobj.bo_dirty.bv_cnt == 0) {
			VI_UNLOCK(vp);
			continue;
		}
		if ((error = vget(vp, lockreq, td)) != 0) {
                              if (error == ENOENT || error == ENOLCK) {
                                      MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
                                      goto loop;
                              }
                              continue;
                      }
      #ifdef QUOTA
		qsyncvp(vp);
      #endif
                      if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0)
                              allerror = error;
                      vput(vp);
              }
              /*
               * Force stale filesystem control information to be flushed.
               */
	if (waitfor == MNT_WAIT || rebooting) {
                      if ((error = softdep_flushworklist(ump->um_mountp, &count, td)))
                              allerror = error;
                      /* Flushed work items may create new vnodes to clean */
                      if (allerror == 0 && count)
                              goto loop;
              }
      
              devvp = ump->um_devvp;
              bo = &devvp->v_bufobj;
              BO_LOCK(bo);
	if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) {
                      BO_UNLOCK(bo);
                      vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                      error = VOP_FSYNC(devvp, waitfor, td);
                      VOP_UNLOCK(devvp);
		if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN))
			error = ffs_sbupdate(ump, waitfor, 0);
		if (error != 0)
			allerror = error;
		if (allerror == 0 && waitfor == MNT_WAIT)
                              goto loop;
              } else if (suspend != 0) {
                      if (softdep_check_suspend(mp,
                                                devvp,
                                                softdep_deps,
                                                softdep_accdeps,
                                                secondary_writes,
                                                secondary_accwrites) != 0) {
                              MNT_IUNLOCK(mp);
                              goto loop;        /* More work needed */
                      }
                      mtx_assert(MNT_MTX(mp), MA_OWNED);
                      mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED;
                      MNT_IUNLOCK(mp);
                      suspended = 1;
              } else
                      BO_UNLOCK(bo);
              /*
               * Write back modified superblock.
               */
	if (fs->fs_fmod != 0 &&
                  (error = ffs_sbupdate(ump, waitfor, suspended)) != 0)
                      allerror = error;
              return (allerror);
      }
      
      int
      ffs_vget(mp, ino, flags, vpp)
              struct mount *mp;
              ino_t ino;
              int flags;
              struct vnode **vpp;
{
              return (ffs_vgetf(mp, ino, flags, vpp, 0));
      }
      
      int
      ffs_vgetf(mp, ino, flags, vpp, ffs_flags)
              struct mount *mp;
              ino_t ino;
              int flags;
              struct vnode **vpp;
              int ffs_flags;
{
              struct fs *fs;
              struct inode *ip;
              struct ufsmount *ump;
              struct buf *bp;
              struct vnode *vp;
              int error;
      
              MPASS((ffs_flags & FFSV_REPLACE) == 0 || (flags & LK_EXCLUSIVE) != 0);
      
              error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
	if (error != 0)
		return (error);
	if (*vpp != NULL) {
		if ((ffs_flags & FFSV_REPLACE) == 0)
                              return (0);
                      vgone(*vpp);
                      vput(*vpp);
              }
      
              /*
               * We must promote to an exclusive lock for vnode creation.  This
               * can happen if lookup is passed LOCKSHARED.
               */
              if ((flags & LK_TYPE_MASK) == LK_SHARED) {
                      flags &= ~LK_TYPE_MASK;
                      flags |= LK_EXCLUSIVE;
              }
      
              /*
	 * We do not lock vnode creation as it is believed to be too
	 * expensive for such a rare case as simultaneous creation of a
	 * vnode for the same ino by different processes.  We just allow
	 * them to race and check later to decide who wins.  Let the race
	 * begin!
               */
      
              ump = VFSTOUFS(mp);
              fs = ump->um_fs;
              ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO);
      
              /* Allocate a new vnode/inode. */
              error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ?
                  &ffs_vnodeops1 : &ffs_vnodeops2, &vp);
              if (error) {
                      *vpp = NULL;
                      uma_zfree(uma_inode, ip);
                      return (error);
              }
              /*
               * FFS supports recursive locking.
               */
              lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
              VN_LOCK_AREC(vp);
              vp->v_data = ip;
              vp->v_bufobj.bo_bsize = fs->fs_bsize;
              ip->i_vnode = vp;
              ip->i_ump = ump;
              ip->i_number = ino;
              ip->i_ea_refs = 0;
              ip->i_nextclustercg = -1;
              ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2;
              ip->i_mode = 0; /* ensure error cases below throw away vnode */
      #ifdef QUOTA
              {
                      int i;
                      for (i = 0; i < MAXQUOTAS; i++)
                              ip->i_dquot[i] = NODQUOT;
              }
      #endif
      
	if (ffs_flags & FFSV_FORCEINSMQ)
                      vp->v_vflag |= VV_FORCEINSMQ;
              error = insmntque(vp, mp);
              if (error != 0) {
                      uma_zfree(uma_inode, ip);
                      *vpp = NULL;
                      return (error);
              }
              vp->v_vflag &= ~VV_FORCEINSMQ;
              error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL);
              if (error != 0)
                      return (error);
              if (*vpp != NULL) {
                      /*
		 * Calls from ffs_valloc() (i.e., with FFSV_REPLACE set)
		 * operate on an empty inode, which must not be found by
		 * other threads until it is fully filled in.  The vnode for
		 * the empty inode must not be re-inserted on the hash by
		 * another thread after we removed it at the beginning.
                       */
                      MPASS((ffs_flags & FFSV_REPLACE) == 0);
                      return (0);
              }
      
              /* Read in the disk contents for the inode, copy into the inode. */
	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
                  (int)fs->fs_bsize, NOCRED, &bp);
              if (error) {
                      /*
                       * The inode does not contain anything useful, so it would
                       * be misleading to leave it on its hash chain. With mode
                       * still zero, it will be unlinked and returned to the free
                       * list by vput().
                       */
                      vgone(vp);
                      vput(vp);
                      *vpp = NULL;
                      return (error);
              }
              if (I_IS_UFS1(ip))
                      ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK);
              else
                      ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK);
              if ((error = ffs_load_inode(bp, ip, fs, ino)) != 0) {
                      bqrelse(bp);
                      vgone(vp);
                      vput(vp);
                      *vpp = NULL;
                      return (error);
              }
              if (DOINGSOFTDEP(vp))
		softdep_load_inodeblock(ip);
              else
                      ip->i_effnlink = ip->i_nlink;
              bqrelse(bp);
      
              /*
               * Initialize the vnode from the inode, check for aliases.
               * Note that the underlying vnode may have changed.
               */
              error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2,
                  &vp);
              if (error) {
                      vgone(vp);
                      vput(vp);
                      *vpp = NULL;
                      return (error);
              }
      
              /*
               * Finish inode initialization.
               */
	if (vp->v_type != VFIFO) {
		/* FFS supports shared locking for all files except fifos. */
		VN_LOCK_ASHARE(vp);
              }
      
              /*
               * Set up a generation number for this inode if it does not
               * already have one. This should only happen on old filesystems.
               */
	if (ip->i_gen == 0) {
                      while (ip->i_gen == 0)
                              ip->i_gen = arc4random();
                      if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
                              UFS_INODE_SET_FLAG(ip, IN_MODIFIED);
                              DIP_SET(ip, i_gen, ip->i_gen);
                      }
              }
      #ifdef MAC
	if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
                      /*
                       * If this vnode is already allocated, and we're running
                       * multi-label, attempt to perform a label association
                       * from the extended attributes on the inode.
                       */
                      error = mac_vnode_associate_extattr(mp, vp);
                      if (error) {
                              /* ufs_inactive will release ip->i_devvp ref. */
                              vgone(vp);
                              vput(vp);
                              *vpp = NULL;
                              return (error);
                      }
              }
      #endif
      
              *vpp = vp;
              return (0);
      }
      
      /*
       * File handle to vnode
       *
       * Have to be really careful about stale file handles:
       * - check that the inode number is valid
       * - for UFS2 check that the inode number is initialized
       * - call ffs_vget() to get the locked inode
       * - check for an unallocated inode (i_mode == 0)
       * - check that the given client host has export rights and return
 *   those rights via exflagsp and credanonp
       */
      static int
      ffs_fhtovp(mp, fhp, flags, vpp)
              struct mount *mp;
              struct fid *fhp;
              int flags;
              struct vnode **vpp;
      {
              struct ufid *ufhp;
              struct ufsmount *ump;
              struct fs *fs;
              struct cg *cgp;
              struct buf *bp;
              ino_t ino;
              u_int cg;
              int error;
      
              ufhp = (struct ufid *)fhp;
              ino = ufhp->ufid_ino;
              ump = VFSTOUFS(mp);
              fs = ump->um_fs;
              if (ino < UFS_ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg)
                      return (ESTALE);
              /*
               * Need to check if inode is initialized because UFS2 does lazy
               * initialization and nfs_fhtovp can offer arbitrary inode numbers.
               */
              if (fs->fs_magic != FS_UFS2_MAGIC)
                      return (ufs_fhtovp(mp, ufhp, flags, vpp));
              cg = ino_to_cg(fs, ino);
              if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0)
                      return (error);
              if (ino >= cg * fs->fs_ipg + cgp->cg_initediblk) {
                      brelse(bp);
                      return (ESTALE);
              }
              brelse(bp);
              return (ufs_fhtovp(mp, ufhp, flags, vpp));
      }
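
/*
 * Worked example for the range check above (the numbers are hypothetical):
 * on a filesystem with fs_ncg = 100 cylinder groups and fs_ipg = 32768
 * inodes per group, any ufid_ino below UFS_ROOTINO or at or above
 * 100 * 32768 = 3276800 is rejected as ESTALE before the cylinder group
 * is even read.
 */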
      
      /*
       * Initialize the filesystem.
       */
      static int
      ffs_init(vfsp)
              struct vfsconf *vfsp;
      {
      
              ffs_susp_initialize();
              softdep_initialize();
              return (ufs_init(vfsp));
      }
      
      /*
       * Undo the work of ffs_init().
       */
      static int
      ffs_uninit(vfsp)
              struct vfsconf *vfsp;
      {
              int ret;
      
              ret = ufs_uninit(vfsp);
              softdep_uninitialize();
              ffs_susp_uninitialize();
              return (ret);
      }
      
      /*
       * Structure used to pass information from ffs_sbupdate to its
       * helper routine ffs_use_bwrite.
       */
      struct devfd {
              struct ufsmount        *ump;
              struct buf        *sbbp;
              int                 waitfor;
              int                 suspended;
              int                 error;
      };
      
      /*
       * Write a superblock and associated information back to disk.
       */
      int
      ffs_sbupdate(ump, waitfor, suspended)
              struct ufsmount *ump;
              int waitfor;
              int suspended;
      {
              struct fs *fs;
              struct buf *sbbp;
              struct devfd devfd;
      
              fs = ump->um_fs;
	if (fs->fs_ronly == 1 &&
                  (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) !=
                  (MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0)
                      panic("ffs_sbupdate: write read-only filesystem");
              /*
               * We use the superblock's buf to serialize calls to ffs_sbupdate().
               */
              sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
                  (int)fs->fs_sbsize, 0, 0, 0);
              /*
               * Initialize info needed for write function.
               */
              devfd.ump = ump;
              devfd.sbbp = sbbp;
              devfd.waitfor = waitfor;
              devfd.suspended = suspended;
              devfd.error = 0;
              return (ffs_sbput(&devfd, fs, fs->fs_sblockloc, ffs_use_bwrite));
      }
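
/*
 * Example of use (as in ffs_mountfs() and ffs_unmount() above): after
 * changing fs_clean or other superblock fields, callers push the update
 * out synchronously with:
 *
 *	(void) ffs_sbupdate(ump, MNT_WAIT, 0);
 */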
      
      /*
       * Write function for use by filesystem-layer routines.
       */
      static int
      ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size)
{
              struct devfd *devfdp;
              struct ufsmount *ump;
              struct buf *bp;
              struct fs *fs;
              int error;
      
              devfdp = devfd;
              ump = devfdp->ump;
              fs = ump->um_fs;
              /*
               * Writing the superblock summary information.
               */
              if (loc != fs->fs_sblockloc) {
                      bp = getblk(ump->um_devvp, btodb(loc), size, 0, 0, 0);
                      bcopy(buf, bp->b_data, (u_int)size);
		if (devfdp->suspended)
			bp->b_flags |= B_VALIDSUSPWRT;
		if (devfdp->waitfor != MNT_WAIT)
			bawrite(bp);
                      else if ((error = bwrite(bp)) != 0)
                              devfdp->error = error;
                      return (0);
              }
              /*
               * Writing the superblock itself. We need to do special checks for it.
               */
              bp = devfdp->sbbp;
              if (devfdp->error != 0) {
                      brelse(bp);
                      return (devfdp->error);
              }
	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 &&
                  (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
                      printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n",
                          fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1);
                      fs->fs_sblockloc = SBLOCK_UFS1;
              }
	if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 &&
                  (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
                      printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n",
                          fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2);
                      fs->fs_sblockloc = SBLOCK_UFS2;
              }
              if (MOUNTEDSOFTDEP(ump->um_mountp))
		softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp);
              bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
              fs = (struct fs *)bp->b_data;
	ffs_oldfscompat_write(fs, ump);
              /*
               * Because we may have made changes to the superblock, we need to
               * recompute its check-hash.
               */
              fs->fs_ckhash = ffs_calc_sbhash(fs);
	if (devfdp->suspended)
		bp->b_flags |= B_VALIDSUSPWRT;
	if (devfdp->waitfor != MNT_WAIT)
		bawrite(bp);
              else if ((error = bwrite(bp)) != 0)
                      devfdp->error = error;
              return (devfdp->error);
      }
      
      static int
      ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
              int attrnamespace, const char *attrname)
      {
      
      #ifdef UFS_EXTATTR
              return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace,
                  attrname));
      #else
              return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace,
                  attrname));
      #endif
      }
      
      static void
      ffs_ifree(struct ufsmount *ump, struct inode *ip)
{
      
              if (ump->um_fstype == UFS1 && ip->i_din1 != NULL)
                      uma_zfree(uma_ufs1, ip->i_din1);
              else if (ip->i_din2 != NULL)
		uma_zfree(uma_ufs2, ip->i_din2);
              uma_zfree(uma_inode, ip);
      }
      
      static int dobkgrdwrite = 1;
      SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
          "Do background writes (honoring the BV_BKGRDWRITE flag)?");
      
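/*
 * Editor's note: the dobkgrdwrite knob above is exported as the
 * debug.dobkgrdwrite sysctl, so background cylinder-group writes can
 * be disabled at run time, e.g.:
 *
 *	sysctl debug.dobkgrdwrite=0
 */
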
      /*
       * Complete a background write started from bwrite.
       */
      static void
      ffs_backgroundwritedone(struct buf *bp)
      {
              struct bufobj *bufobj;
              struct buf *origbp;
      
              /*
               * Find the original buffer that we are writing.
               */
              bufobj = bp->b_bufobj;
              BO_LOCK(bufobj);
              if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL)
                      panic("backgroundwritedone: lost buffer");
      
              /*
               * We should mark the cylinder group buffer origbp as
	 * dirty, to not lose the failed write.
               */
              if ((bp->b_ioflags & BIO_ERROR) != 0)
                      origbp->b_vflags |= BV_BKGRDERR;
              BO_UNLOCK(bufobj);
              /*
               * Process dependencies then return any unfinished ones.
               */
              if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0)
                      buf_complete(bp);
      #ifdef SOFTUPDATES
              if (!LIST_EMPTY(&bp->b_dep))
                      softdep_move_dependencies(bp, origbp);
      #endif
              /*
               * This buffer is marked B_NOCACHE so when it is released
               * by biodone it will be tossed.
               */
              bp->b_flags |= B_NOCACHE;
              bp->b_flags &= ~B_CACHE;
              pbrelvp(bp);
      
              /*
               * Prevent brelse() from trying to keep and re-dirtying bp on
               * errors. It causes b_bufobj dereference in
               * bdirty()/reassignbuf(), and b_bufobj was cleared in
               * pbrelvp() above.
               */
              if ((bp->b_ioflags & BIO_ERROR) != 0)
                      bp->b_flags |= B_INVAL;
              bufdone(bp);
              BO_LOCK(bufobj);
              /*
               * Clear the BV_BKGRDINPROG flag in the original buffer
               * and awaken it if it is waiting for the write to complete.
               * If BV_BKGRDINPROG is not set in the original buffer it must
               * have been released and re-instantiated - which is not legal.
               */
              KASSERT((origbp->b_vflags & BV_BKGRDINPROG),
                  ("backgroundwritedone: lost buffer2"));
              origbp->b_vflags &= ~BV_BKGRDINPROG;
              if (origbp->b_vflags & BV_BKGRDWAIT) {
                      origbp->b_vflags &= ~BV_BKGRDWAIT;
                      wakeup(&origbp->b_xflags);
              }
              BO_UNLOCK(bufobj);
      }
      
      
      /*
       * Write, release buffer on completion.  (Done by iodone
       * if async).  Do not bother writing anything if the buffer
       * is invalid.
       *
       * Note that we set B_CACHE here, indicating that buffer is
       * fully valid and thus cacheable.  This is true even of NFS
       * now so we set it generally.  This could be set either here
       * or in biodone() since the I/O is synchronous.  We put it
       * here.
       */
      static int
      ffs_bufwrite(struct buf *bp)
{
              struct buf *newbp;
              struct cg *cgp;
      
              CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
              if (bp->b_flags & B_INVAL) {
                      brelse(bp);
                      return (0);
              }
      
              if (!BUF_ISLOCKED(bp))
                      panic("bufwrite: buffer is not busy???");
              /*
               * If a background write is already in progress, delay
               * writing this block if it is asynchronous. Otherwise
               * wait for the background write to complete.
               */
              BO_LOCK(bp->b_bufobj);
	if (bp->b_vflags & BV_BKGRDINPROG) {
		if (bp->b_flags & B_ASYNC) {
			BO_UNLOCK(bp->b_bufobj);
                              bdwrite(bp);
                              return (0);
                      }
                      bp->b_vflags |= BV_BKGRDWAIT;
                      msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO,
                          "bwrbg", 0);
                      if (bp->b_vflags & BV_BKGRDINPROG)
                              panic("bufwrite: still writing");
              }
              bp->b_vflags &= ~BV_BKGRDERR;
              BO_UNLOCK(bp->b_bufobj);
      
              /*
               * If this buffer is marked for background writing and we
               * do not have to wait for it, make a copy and write the
               * copy so as to leave this buffer ready for further use.
               *
               * This optimization eats a lot of memory.  If we have a page
               * or buffer shortfall we can't do it.
               */
	if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
	    (bp->b_flags & B_ASYNC) &&
                  !vm_page_count_severe() &&
                  !buf_dirty_count_severe()) {
                      KASSERT(bp->b_iodone == NULL,
                          ("bufwrite: needs chained iodone (%p)", bp->b_iodone));
      
                      /* get a new block */
                      newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD);
                      if (newbp == NULL)
                              goto normal_write;
      
                      KASSERT(buf_mapped(bp), ("Unmapped cg"));
                      memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
                      BO_LOCK(bp->b_bufobj);
                      bp->b_vflags |= BV_BKGRDINPROG;
                      BO_UNLOCK(bp->b_bufobj);
                      newbp->b_xflags |=
                          (bp->b_xflags & BX_FSPRIV) | BX_BKGRDMARKER;
                      newbp->b_lblkno = bp->b_lblkno;
                      newbp->b_blkno = bp->b_blkno;
                      newbp->b_offset = bp->b_offset;
                      newbp->b_iodone = ffs_backgroundwritedone;
                      newbp->b_flags |= B_ASYNC;
                      newbp->b_flags &= ~B_INVAL;
                      pbgetvp(bp->b_vp, newbp);
      
      #ifdef SOFTUPDATES
                      /*
                       * Move over the dependencies.  If there are rollbacks,
                       * leave the parent buffer dirtied as it will need to
                       * be written again.
                       */
		if (LIST_EMPTY(&bp->b_dep) ||
                          softdep_move_dependencies(bp, newbp) == 0)
                              bundirty(bp);
      #else
                      bundirty(bp);
      #endif
      
                      /*
                       * Initiate write on the copy, release the original.  The
                       * BKGRDINPROG flag prevents it from going away until 
                       * the background write completes. We have to recalculate
                       * its check hash in case the buffer gets freed and then
                       * reconstituted from the buffer cache during a later read.
                       */
                      if ((bp->b_xflags & BX_CYLGRP) != 0) {
			cgp = (struct cg *)bp->b_data;
                              cgp->cg_ckhash = 0;
                              cgp->cg_ckhash =
                                  calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
                      }
                      bqrelse(bp);
                      bp = newbp;
              } else
                      /* Mark the buffer clean */
                      bundirty(bp);
      
      
              /* Let the normal bufwrite do the rest for us */
      normal_write:
              /*
               * If we are writing a cylinder group, update its time.
               */
	if ((bp->b_xflags & BX_CYLGRP) != 0) {
		cgp = (struct cg *)bp->b_data;
                      cgp->cg_old_time = cgp->cg_time = time_second;
              }
              return (bufwrite(bp));
      }
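
/*
 * Editor's sketch of the background-write hand-off implemented above,
 * in rough chronological order (all names refer to code in this file):
 *
 *	ffs_bufwrite(bp)
 *		geteblk() allocates newbp and the contents are copied
 *		bp->b_vflags |= BV_BKGRDINPROG
 *		dependencies are moved to newbp, bp is undirtied
 *		bqrelse(bp); the original is immediately reusable
 *		bufwrite(newbp) starts the asynchronous write
 *	... I/O completes ...
 *	ffs_backgroundwritedone(newbp)
 *		a write error is recorded on the original as BV_BKGRDERR
 *		unfinished dependencies are moved back to the original
 *		BV_BKGRDINPROG is cleared and any waiter sleeping on
 *		b_xflags in ffs_bufwrite() is woken up
 */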
      
      
      static void
      ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
{
              struct vnode *vp;
              struct buf *tbp;
              int error, nocopy;
      
              /*
               * This is the bufobj strategy for the private VCHR vnodes
               * used by FFS to access the underlying storage device.
               * We override the default bufobj strategy and thus bypass
               * VOP_STRATEGY() for these vnodes.
               */
              vp = bo2vnode(bo);
	KASSERT(bp->b_vp == NULL || bp->b_vp->v_type != VCHR ||
                  bp->b_vp->v_rdev == NULL ||
                  bp->b_vp->v_rdev->si_mountpt == NULL ||
                  VFSTOUFS(bp->b_vp->v_rdev->si_mountpt) == NULL ||
                  vp == VFSTOUFS(bp->b_vp->v_rdev->si_mountpt)->um_devvp,
                  ("ffs_geom_strategy() with wrong vp"));
	if (bp->b_iocmd == BIO_WRITE) {
		if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
		    bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
                          (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
                              panic("ffs_geom_strategy: bad I/O");
                      nocopy = bp->b_flags & B_NOCOPY;
                      bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY);
		if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 &&
                          vp->v_rdev->si_snapdata != NULL) {
                              if ((bp->b_flags & B_CLUSTER) != 0) {
                                      runningbufwakeup(bp);
                                      TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
                                                    b_cluster.cluster_entry) {
                                              error = ffs_copyonwrite(vp, tbp);
                                              if (error != 0 &&
                                                  error != EOPNOTSUPP) {
                                                      bp->b_error = error;
                                                      bp->b_ioflags |= BIO_ERROR;
                                                      bufdone(bp);
                                                      return;
                                              }
                                      }
                                      bp->b_runningbufspace = bp->b_bufsize;
                                      atomic_add_long(&runningbufspace,
                                                     bp->b_runningbufspace);
                              } else {
                                      error = ffs_copyonwrite(vp, bp);
                                      if (error != 0 && error != EOPNOTSUPP) {
                                              bp->b_error = error;
                                              bp->b_ioflags |= BIO_ERROR;
                                              bufdone(bp);
                                              return;
                                      }
                              }
                      }
      #ifdef SOFTUPDATES
                      if ((bp->b_flags & B_CLUSTER) != 0) {
			TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
			    b_cluster.cluster_entry) {
				if (!LIST_EMPTY(&tbp->b_dep))
                                              buf_start(tbp);
                              }
                      } else {
			if (!LIST_EMPTY(&bp->b_dep))
                                      buf_start(bp);
                      }
      
      #endif
                      /*
                       * Check for metadata that needs check-hashes and update them.
                       */
		switch (bp->b_xflags & BX_FSPRIV) {
		case BX_CYLGRP:
			((struct cg *)bp->b_data)->cg_ckhash = 0;
                              ((struct cg *)bp->b_data)->cg_ckhash =
                                  calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
                              break;
      
                      case BX_SUPERBLOCK:
                      case BX_INODE:
                      case BX_INDIR:
                      case BX_DIR:
                              printf("Check-hash write is unimplemented!!!\n");
                              break;
      
                      case 0:
                              break;
      
                      default:
                              printf("multiple buffer types 0x%b\n",
                                  (u_int)(bp->b_xflags & BX_FSPRIV),
                                  PRINT_UFS_BUF_XFLAGS);
                              break;
                      }
              }
              g_vfs_strategy(bo, bp);
      }
      
      int
      ffs_own_mount(const struct mount *mp)
      {
      
              if (mp->mnt_op == &ufs_vfsops)
                      return (1);
              return (0);
      }
      
      #ifdef        DDB
      #ifdef SOFTUPDATES
      
      /* defined in ffs_softdep.c */
      extern void db_print_ffs(struct ufsmount *ump);
      
      DB_SHOW_COMMAND(ffs, db_show_ffs)
      {
              struct mount *mp;
              struct ufsmount *ump;
      
              if (have_addr) {
                      ump = VFSTOUFS((struct mount *)addr);
                      db_print_ffs(ump);
                      return;
              }
      
              TAILQ_FOREACH(mp, &mountlist, mnt_list) {
                      if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name))
                              db_print_ffs(VFSTOUFS(mp));
              }
      }
      
      #endif        /* SOFTUPDATES */
      #endif        /* DDB */
      /*-
       * SPDX-License-Identifier: BSD-3-Clause
       *
       * Copyright (c) 2009 Bruce Simpson.
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. The name of the author may not be used to endorse or promote
       *    products derived from this software without specific prior written
       *    permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      /*
       * IPv6 multicast socket, group, and socket option processing module.
 * Normative references: RFC 2292, RFC 3493, RFC 3542, RFC 3678, RFC 3810.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_inet6.h"
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/kernel.h>
      #include <sys/ktr.h>
      #include <sys/malloc.h>
      #include <sys/mbuf.h>
      #include <sys/protosw.h>
      #include <sys/socket.h>
      #include <sys/socketvar.h>
      #include <sys/sysctl.h>
      #include <sys/priv.h>
      #include <sys/taskqueue.h>
      #include <sys/tree.h>
      
      #include <net/if.h>
      #include <net/if_var.h>
      #include <net/if_dl.h>
      #include <net/route.h>
      #include <net/vnet.h>
      
      #include <netinet/in.h>
      #include <netinet/udp.h>
      #include <netinet/in_var.h>
      #include <netinet/ip_var.h>
      #include <netinet/udp_var.h>
      #include <netinet6/in6_fib.h>
      #include <netinet6/in6_var.h>
      #include <netinet/ip6.h>
      #include <netinet/icmp6.h>
      #include <netinet6/ip6_var.h>
      #include <netinet/in_pcb.h>
      #include <netinet/tcp_var.h>
      #include <netinet6/nd6.h>
      #include <netinet6/mld6_var.h>
      #include <netinet6/scope6_var.h>
      
      #ifndef KTR_MLD
      #define KTR_MLD KTR_INET6
      #endif
      
      #ifndef __SOCKUNION_DECLARED
      union sockunion {
              struct sockaddr_storage        ss;
              struct sockaddr                sa;
              struct sockaddr_dl        sdl;
              struct sockaddr_in6        sin6;
      };
      typedef union sockunion sockunion_t;
      #define __SOCKUNION_DECLARED
      #endif /* __SOCKUNION_DECLARED */
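
/*
 * Editor's sketch: the sockunion above lets the option-processing code
 * view a single piece of sockaddr storage as whichever flavour it
 * needs without a cast at every use.  A hypothetical fragment:
 *
 *	sockunion_t gsa;
 *
 *	memset(&gsa, 0, sizeof(gsa));
 *	error = sooptcopyin(sopt, &gsa, sizeof(struct sockaddr_in6),
 *	    sizeof(struct sockaddr_in6));
 *	if (error == 0 && gsa.sa.sa_family == AF_INET6)
 *		in6_clearscope(&gsa.sin6.sin6_addr);
 */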
      
      static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter",
          "IPv6 multicast PCB-layer source filter");
      MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group");
      static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options");
      static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource",
          "IPv6 multicast MLD-layer source filter");
      
RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp);
      
      /*
       * Locking:
       * - Lock order is: Giant, IN6_MULTI_LOCK, INP_WLOCK,
       *   IN6_MULTI_LIST_LOCK, MLD_LOCK, IF_ADDR_LOCK.
       * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however
       *   it can be taken by code in net/if.c also.
       * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK.
       *
       * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly
       * any need for in6_multi itself to be virtualized -- it is bound to an ifp
       * anyway no matter what happens.
       */
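
/*
 * Editor's sketch (not compiled): the order documented above means a
 * path that needs several of these locks nests them roughly as
 * follows, releasing in the reverse order:
 *
 *	IN6_MULTI_LOCK();
 *	INP_WLOCK(inp);
 *	IN6_MULTI_LIST_LOCK();
 *	MLD_LOCK();
 *	... update in6_multi, filter and MLD state ...
 *	MLD_UNLOCK();
 *	IN6_MULTI_LIST_UNLOCK();
 *	INP_WUNLOCK(inp);
 *	IN6_MULTI_UNLOCK();
 */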
      struct mtx in6_multi_list_mtx;
      MTX_SYSINIT(in6_multi_mtx, &in6_multi_list_mtx, "in6_multi_list_mtx", MTX_DEF);
      
      struct mtx in6_multi_free_mtx;
      MTX_SYSINIT(in6_multi_free_mtx, &in6_multi_free_mtx, "in6_multi_free_mtx", MTX_DEF);
      
      struct sx in6_multi_sx;
      SX_SYSINIT(in6_multi_sx, &in6_multi_sx, "in6_multi_sx");
      
      static void        im6f_commit(struct in6_mfilter *);
      static int        im6f_get_source(struct in6_mfilter *imf,
                          const struct sockaddr_in6 *psin,
                          struct in6_msource **);
      static struct in6_msource *
                      im6f_graft(struct in6_mfilter *, const uint8_t,
                          const struct sockaddr_in6 *);
      static void        im6f_leave(struct in6_mfilter *);
      static int        im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *);
      static void        im6f_purge(struct in6_mfilter *);
      static void        im6f_rollback(struct in6_mfilter *);
      static void        im6f_reap(struct in6_mfilter *);
      static struct in6_mfilter *
                      im6o_match_group(const struct ip6_moptions *,
                          const struct ifnet *, const struct sockaddr *);
      static struct in6_msource *
                      im6o_match_source(struct in6_mfilter *, const struct sockaddr *);
      static void        im6s_merge(struct ip6_msource *ims,
                          const struct in6_msource *lims, const int rollback);
      static int        in6_getmulti(struct ifnet *, const struct in6_addr *,
                          struct in6_multi **);
      static int        in6_joingroup_locked(struct ifnet *, const struct in6_addr *,
                          struct in6_mfilter *, struct in6_multi **, int);
      static int        in6m_get_source(struct in6_multi *inm,
                          const struct in6_addr *addr, const int noalloc,
                          struct ip6_msource **pims);
      #ifdef KTR
      static int        in6m_is_ifp_detached(const struct in6_multi *);
      #endif
      static int        in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *);
      static void        in6m_purge(struct in6_multi *);
      static void        in6m_reap(struct in6_multi *);
      static struct ip6_moptions *
                      in6p_findmoptions(struct inpcb *);
      static int        in6p_get_source_filters(struct inpcb *, struct sockopt *);
      static int        in6p_join_group(struct inpcb *, struct sockopt *);
      static int        in6p_leave_group(struct inpcb *, struct sockopt *);
      static struct ifnet *
                      in6p_lookup_mcast_ifp(const struct inpcb *,
                          const struct sockaddr_in6 *);
      static int        in6p_block_unblock_source(struct inpcb *, struct sockopt *);
      static int        in6p_set_multicast_if(struct inpcb *, struct sockopt *);
      static int        in6p_set_source_filters(struct inpcb *, struct sockopt *);
      static int        sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS);
      
      SYSCTL_DECL(_net_inet6_ip6);        /* XXX Not in any common header. */
      
      static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast,
          CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "IPv6 multicast");
      
      static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER;
      SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc,
          CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0,
          "Max source filters per group");
      
      static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER;
      SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc,
          CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0,
          "Max source filters per socket");
      
      /* TODO Virtualize this switch. */
      int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
      SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
          &in6_mcast_loop, 0, "Loopback multicast datagrams by default");
      
      static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters,
          CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters,
          "Per-interface stack-wide source filters");
      
      #ifdef KTR
      /*
       * Inline function which wraps assertions for a valid ifp.
       * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
       * is detached.
       */
      static int __inline
      in6m_is_ifp_detached(const struct in6_multi *inm)
      {
              struct ifnet *ifp;
      
              KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
              ifp = inm->in6m_ifma->ifma_ifp;
              if (ifp != NULL) {
                      /*
                       * Sanity check that network-layer notion of ifp is the
                       * same as that of link-layer.
                       */
                      KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__));
              }
      
              return (ifp == NULL);
      }
      #endif
      
      /*
       * Initialize an in6_mfilter structure to a known state at t0, t1
       * with an empty source filter list.
       */
      static __inline void
      im6f_init(struct in6_mfilter *imf, const int st0, const int st1)
      {
              memset(imf, 0, sizeof(struct in6_mfilter));
              RB_INIT(&imf->im6f_sources);
              imf->im6f_st[0] = st0;
              imf->im6f_st[1] = st1;
      }
      
      struct in6_mfilter *
      ip6_mfilter_alloc(const int mflags, const int st0, const int st1)
      {
              struct in6_mfilter *imf;
      
	imf = malloc(sizeof(*imf), M_IN6MFILTER, mflags);
      
              if (imf != NULL)
                      im6f_init(imf, st0, st1);
      
              return (imf);
      }
      
      void
      ip6_mfilter_free(struct in6_mfilter *imf)
{
      
              im6f_purge(imf);
              free(imf, M_IN6MFILTER);
      }
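
/*
 * Editor's sketch: a typical consumer allocates a filter in the
 * default EXCLUDE mode before linking it into an ip6_moptions head and
 * must be prepared for M_NOWAIT allocation failure.  Illustrative
 * only; the linkage into im6o_head is omitted.
 *
 *	struct in6_mfilter *imf;
 *
 *	imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE);
 *	if (imf == NULL)
 *		return (ENOMEM);
 *	...
 *	ip6_mfilter_free(imf);
 */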
      
      /*
       * Find an IPv6 multicast group entry for this ip6_moptions instance
       * which matches the specified group, and optionally an interface.
 * Return a pointer to the matching in6_mfilter, or NULL if not found.
       */
      static struct in6_mfilter *
      im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp,
          const struct sockaddr *group)
      {
              const struct sockaddr_in6 *gsin6;
              struct in6_mfilter *imf;
              struct in6_multi *inm;
      
              gsin6 = (const struct sockaddr_in6 *)group;
      
	IP6_MFILTER_FOREACH(imf, &imo->im6o_head) {
                      inm = imf->im6f_in6m;
                      if (inm == NULL)
                              continue;
		if ((ifp == NULL || (inm->in6m_ifp == ifp)) &&
                          IN6_ARE_ADDR_EQUAL(&inm->in6m_addr,
                          &gsin6->sin6_addr)) {
                              break;
                      }
              }
              return (imf);
      }
      
      /*
       * Find an IPv6 multicast source entry for this imo which matches
       * the given group index for this socket, and source address.
       *
       * XXX TODO: The scope ID, if present in src, is stripped before
       * any comparison. We SHOULD enforce scope/zone checks where the source
       * filter entry has a link scope.
       *
       * NOTE: This does not check if the entry is in-mode, merely if
       * it exists, which may not be the desired behaviour.
       */
      static struct in6_msource *
      im6o_match_source(struct in6_mfilter *imf, const struct sockaddr *src)
      {
              struct ip6_msource         find;
              struct ip6_msource        *ims;
              const sockunion_t        *psa;
      
              KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__));
      
              psa = (const sockunion_t *)src;
              find.im6s_addr = psa->sin6.sin6_addr;
              in6_clearscope(&find.im6s_addr);                /* XXX */
              ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
      
              return ((struct in6_msource *)ims);
      }
      
      /*
       * Perform filtering for multicast datagrams on a socket by group and source.
       *
       * Returns 0 if a datagram should be allowed through, or various error codes
       * if the socket was not a member of the group, or the source was muted, etc.
       */
      int
      im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp,
          const struct sockaddr *group, const struct sockaddr *src)
{
              struct in6_mfilter *imf;
              struct in6_msource *ims;
              int mode;
      
              KASSERT(ifp != NULL, ("%s: null ifp", __func__));
      
	imf = im6o_match_group(imo, ifp, group);
              if (imf == NULL)
                      return (MCAST_NOTGMEMBER);
      
              /*
               * Check if the source was included in an (S,G) join.
               * Allow reception on exclusive memberships by default,
               * reject reception on inclusive memberships by default.
               * Exclude source only if an in-mode exclude filter exists.
               * Include source only if an in-mode include filter exists.
               * NOTE: We are comparing group state here at MLD t1 (now)
               * with socket-layer t0 (since last downcall).
               */
              mode = imf->im6f_st[1];
              ims = im6o_match_source(imf, src);
      
              if ((ims == NULL && mode == MCAST_INCLUDE) ||
                  (ims != NULL && ims->im6sl_st[0] != mode))
                      return (MCAST_NOTSMEMBER);
      
              return (MCAST_PASS);
      }
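
/*
 * Editor's sketch: transport-layer input paths are the expected
 * callers; a datagram is delivered to the socket only when the filter
 * passes.  The surrounding variable names are illustrative.
 *
 *	if (im6o != NULL &&
 *	    im6o_mc_filter(im6o, ifp, (struct sockaddr *)&mcaddr,
 *	    (struct sockaddr *)&srcaddr) != MCAST_PASS) {
 *		... drop the datagram for this socket ...
 *	}
 */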
      
      /*
       * Find and return a reference to an in6_multi record for (ifp, group),
       * and bump its reference count.
       * If one does not exist, try to allocate it, and update link-layer multicast
       * filters on ifp to listen for group.
       * Assumes the IN6_MULTI lock is held across the call.
       * Return 0 if successful, otherwise return an appropriate error code.
       */
      static int
      in6_getmulti(struct ifnet *ifp, const struct in6_addr *group,
          struct in6_multi **pinm)
      {
              struct epoch_tracker         et;
              struct sockaddr_in6         gsin6;
              struct ifmultiaddr        *ifma;
              struct in6_multi        *inm;
              int                         error;
      
              error = 0;
      
              /*
               * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK;
               * if_addmulti() takes this mutex itself, so we must drop and
               * re-acquire around the call.
               */
              IN6_MULTI_LOCK_ASSERT();
              IN6_MULTI_LIST_LOCK();
              IF_ADDR_WLOCK(ifp);
              NET_EPOCH_ENTER(et);
              inm = in6m_lookup_locked(ifp, group);
              NET_EPOCH_EXIT(et);
      
              if (inm != NULL) {
                      /*
                       * If we already joined this group, just bump the
                       * refcount and return it.
                       */
                      KASSERT(inm->in6m_refcount >= 1,
                          ("%s: bad refcount %d", __func__, inm->in6m_refcount));
		in6m_acquire_locked(inm);
                      *pinm = inm;
                      goto out_locked;
              }
      
              memset(&gsin6, 0, sizeof(gsin6));
              gsin6.sin6_family = AF_INET6;
              gsin6.sin6_len = sizeof(struct sockaddr_in6);
              gsin6.sin6_addr = *group;
      
              /*
               * Check if a link-layer group is already associated
               * with this network-layer group on the given ifnet.
               */
              IN6_MULTI_LIST_UNLOCK();
              IF_ADDR_WUNLOCK(ifp);
              error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma);
              if (error != 0)
                      return (error);
              IN6_MULTI_LIST_LOCK();
              IF_ADDR_WLOCK(ifp);
      
              /*
               * If something other than netinet6 is occupying the link-layer
               * group, print a meaningful error message and back out of
               * the allocation.
               * Otherwise, bump the refcount on the existing network-layer
               * group association and return it.
               */
              if (ifma->ifma_protospec != NULL) {
                      inm = (struct in6_multi *)ifma->ifma_protospec;
      #ifdef INVARIANTS
                      KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
                          __func__));
                      KASSERT(ifma->ifma_addr->sa_family == AF_INET6,
                          ("%s: ifma not AF_INET6", __func__));
                      KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
                      if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp ||
                          !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group))
                              panic("%s: ifma %p is inconsistent with %p (%p)",
                                  __func__, ifma, inm, group);
      #endif
                      in6m_acquire_locked(inm);
                      *pinm = inm;
                      goto out_locked;
              }
      
              IF_ADDR_WLOCK_ASSERT(ifp);
      
              /*
               * A new in6_multi record is needed; allocate and initialize it.
               * We DO NOT perform an MLD join as the in6_ layer may need to
               * push an initial source list down to MLD to support SSM.
               *
               * The initial source filter state is INCLUDE, {} as per the RFC.
               * Pending state-changes per group are subject to a bounds check.
               */
	inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO);
              if (inm == NULL) {
                      IN6_MULTI_LIST_UNLOCK();
                      IF_ADDR_WUNLOCK(ifp);
                      if_delmulti_ifma(ifma);
                      return (ENOMEM);
              }
              inm->in6m_addr = *group;
              inm->in6m_ifp = ifp;
              inm->in6m_mli = MLD_IFINFO(ifp);
              inm->in6m_ifma = ifma;
              inm->in6m_refcount = 1;
              inm->in6m_state = MLD_NOT_MEMBER;
              mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES);
      
              inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED;
              inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
              RB_INIT(&inm->in6m_srcs);
      
              ifma->ifma_protospec = inm;
              *pinm = inm;
      
       out_locked:
              IN6_MULTI_LIST_UNLOCK();
              IF_ADDR_WUNLOCK(ifp);
              return (error);
      }
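
/*
 * Editor's sketch: callers hold the IN6_MULTI lock across the lookup
 * or allocation and then hand the referenced group on to the MLD
 * layer; variable names are illustrative.
 *
 *	IN6_MULTI_LOCK();
 *	error = in6_getmulti(ifp, &group, &inm);
 *	if (error == 0) {
 *		... merge filter state, kick MLD, and eventually
 *		... release the reference taken here
 *	}
 *	IN6_MULTI_UNLOCK();
 */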
      
      /*
       * Drop a reference to an in6_multi record.
       *
       * If the refcount drops to 0, free the in6_multi record and
       * delete the underlying link-layer membership.
       */
      static void
      in6m_release(struct in6_multi *inm)
      {
              struct ifmultiaddr *ifma;
              struct ifnet *ifp;
      
              CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount);
      
              MPASS(inm->in6m_refcount == 0);
              CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm);
      
              ifma = inm->in6m_ifma;
              ifp = inm->in6m_ifp;
              MPASS(ifma->ifma_llifma == NULL);
      
              /* XXX this access is not covered by IF_ADDR_LOCK */
              CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma);
              KASSERT(ifma->ifma_protospec == NULL,
                  ("%s: ifma_protospec != NULL", __func__));
              if (ifp == NULL)
                      ifp = ifma->ifma_ifp;
      
              if (ifp != NULL) {
                      CURVNET_SET(ifp->if_vnet);
                      in6m_purge(inm);
                      free(inm, M_IP6MADDR);
                      if_delmulti_ifma_flags(ifma, 1);
                      CURVNET_RESTORE();
                      if_rele(ifp);
              } else {
                      in6m_purge(inm);
                      free(inm, M_IP6MADDR);
                      if_delmulti_ifma_flags(ifma, 1);
              }
      }
      
      /*
       * Interface detach can happen in a taskqueue thread context, so we must use a
       * dedicated thread to avoid deadlocks when draining in6m_release tasks.
       */
      TASKQUEUE_DEFINE_THREAD(in6m_free);
      static struct task in6m_free_task;
      static struct in6_multi_head in6m_free_list = SLIST_HEAD_INITIALIZER();
      static void in6m_release_task(void *arg __unused, int pending __unused);
      
      static void
      in6m_init(void)
      {
              TASK_INIT(&in6m_free_task, 0, in6m_release_task, NULL);
      }
      SYSINIT(in6m_init, SI_SUB_TASKQ, SI_ORDER_ANY, in6m_init, NULL);
      
      void
      in6m_release_list_deferred(struct in6_multi_head *inmh)
      {
	if (SLIST_EMPTY(inmh))
                      return;
              mtx_lock(&in6_multi_free_mtx);
              SLIST_CONCAT(&in6m_free_list, inmh, in6_multi, in6m_nrele);
              mtx_unlock(&in6_multi_free_mtx);
              taskqueue_enqueue(taskqueue_in6m_free, &in6m_free_task);
      }
      
      void
      in6m_release_wait(void)
      {
              taskqueue_drain_all(taskqueue_in6m_free);
      }
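
/*
 * Editor's sketch of the deferred-free pattern served by the two
 * functions above: a caller that drops the last references while
 * holding locks collects the doomed groups on a local list and lets
 * the taskqueue thread do the actual teardown.
 *
 *	struct in6_multi_head inmh;
 *
 *	SLIST_INIT(&inmh);
 *	IN6_MULTI_LIST_LOCK();
 *	... in6m_rele_locked(&inmh, inm) for each released group ...
 *	IN6_MULTI_LIST_UNLOCK();
 *	in6m_release_list_deferred(&inmh);
 */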
      
      void
      in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm)
      {
              struct ifnet *ifp;
              struct ifaddr *ifa;
              struct in6_ifaddr *ifa6;
              struct in6_multi_mship *imm, *imm_tmp;
              struct ifmultiaddr *ifma, *ll_ifma;
      
              IN6_MULTI_LIST_LOCK_ASSERT();
      
              ifp = inm->in6m_ifp;
              if (ifp == NULL)
                      return;                /* already called */
      
              inm->in6m_ifp = NULL;
              IF_ADDR_WLOCK_ASSERT(ifp);
              ifma = inm->in6m_ifma;
              if (ifma == NULL)
                      return;
      
              if_ref(ifp);
              if (ifma->ifma_flags & IFMA_F_ENQUEUED) {
                      CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
                      ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
              }
              MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname);
              if ((ll_ifma = ifma->ifma_llifma) != NULL) {
                      MPASS(ifma != ll_ifma);
                      ifma->ifma_llifma = NULL;
                      MPASS(ll_ifma->ifma_llifma == NULL);
                      MPASS(ll_ifma->ifma_ifp == ifp);
                      if (--ll_ifma->ifma_refcount == 0) {
                              if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
                                      CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link);
                                      ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
                              }
                              MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname);
                              if_freemulti(ll_ifma);
                      }
              }
              CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
                      if (ifa->ifa_addr->sa_family != AF_INET6)
                              continue;
                      ifa6 = (void *)ifa;
                      LIST_FOREACH_SAFE(imm, &ifa6->ia6_memberships,
                          i6mm_chain, imm_tmp) {
                              if (inm == imm->i6mm_maddr) {
                                      LIST_REMOVE(imm, i6mm_chain);
                                      free(imm, M_IP6MADDR);
                                      in6m_rele_locked(inmh, inm);
                              }
                      }
              }
      }
      
      static void
      in6m_release_task(void *arg __unused, int pending __unused)
      {
              struct in6_multi_head in6m_free_tmp;
              struct in6_multi *inm, *tinm;
      
              SLIST_INIT(&in6m_free_tmp);
              mtx_lock(&in6_multi_free_mtx);
              SLIST_CONCAT(&in6m_free_tmp, &in6m_free_list, in6_multi, in6m_nrele);
              mtx_unlock(&in6_multi_free_mtx);
              IN6_MULTI_LOCK();
              SLIST_FOREACH_SAFE(inm, &in6m_free_tmp, in6m_nrele, tinm) {
                      SLIST_REMOVE_HEAD(&in6m_free_tmp, in6m_nrele);
                      in6m_release(inm);
              }
              IN6_MULTI_UNLOCK();
      }
      
      /*
       * Clear recorded source entries for a group.
       * Used by the MLD code. Caller must hold the IN6_MULTI lock.
       * FIXME: Should reap.
       */
      void
      in6m_clear_recorded(struct in6_multi *inm)
      {
              struct ip6_msource        *ims;
      
              IN6_MULTI_LIST_LOCK_ASSERT();
      
              RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
                      if (ims->im6s_stp) {
                              ims->im6s_stp = 0;
                              --inm->in6m_st[1].iss_rec;
                      }
              }
              KASSERT(inm->in6m_st[1].iss_rec == 0,
                  ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec));
      }
      
      /*
       * Record a source as pending for a Source-Group MLDv2 query.
       * This lives here as it modifies the shared tree.
       *
       * inm is the group descriptor.
       * naddr is the address of the source to record in network-byte order.
       *
       * If the net.inet6.mld.sgalloc sysctl is non-zero, we will
       * lazy-allocate a source node in response to an SG query.
       * Otherwise, no allocation is performed. This saves some memory
       * with the trade-off that the source will not be reported to the
       * router if joined in the window between the query response and
       * the group actually being joined on the local host.
       *
       * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed.
       * This turns off the allocation of a recorded source entry if
       * the group has not been joined.
       *
       * Return 0 if the source didn't exist or was already marked as recorded.
       * Return 1 if the source was marked as recorded by this function.
       * Return <0 if any error occurred (negated errno code).
       */
      int
      in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr)
      {
              struct ip6_msource         find;
              struct ip6_msource        *ims, *nims;
      
              IN6_MULTI_LIST_LOCK_ASSERT();
      
              find.im6s_addr = *addr;
              ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
              if (ims && ims->im6s_stp)
                      return (0);
              if (ims == NULL) {
                      if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
                              return (-ENOSPC);
                      nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
                          M_NOWAIT | M_ZERO);
                      if (nims == NULL)
                              return (-ENOMEM);
                      nims->im6s_addr = find.im6s_addr;
                      RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
                      ++inm->in6m_nsrc;
                      ims = nims;
              }
      
              /*
               * Mark the source as recorded and update the recorded
               * source count.
               */
              ++ims->im6s_stp;
              ++inm->in6m_st[1].iss_rec;
      
              return (1);
      }
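
/*
 * Editor's sketch: a query-processing loop interprets the return
 * values documented above roughly as follows (illustrative only):
 *
 *	retval = in6m_record_source(inm, &addr);
 *	if (retval < 0)
 *		... allocation failed or limit reached; stop recording ...
 *	else if (retval > 0)
 *		... newly recorded; count it toward the pending report ...
 *	else
 *		... already recorded; nothing more to do ...
 */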
      
      /*
       * Return a pointer to an in6_msource owned by an in6_mfilter,
       * given its source address.
       * Lazy-allocate if needed. If this is a new entry its filter state is
       * undefined at t0.
       *
       * imf is the filter set being modified.
       * addr is the source address.
       *
       * SMPng: May be called with locks held; malloc must not block.
       */
      static int
      im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin,
          struct in6_msource **plims)
      {
              struct ip6_msource         find;
              struct ip6_msource        *ims, *nims;
              struct in6_msource        *lims;
              int                         error;
      
              error = 0;
              ims = NULL;
              lims = NULL;
      
              find.im6s_addr = psin->sin6_addr;
              ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
              lims = (struct in6_msource *)ims;
              if (lims == NULL) {
                      if (imf->im6f_nsrc == in6_mcast_maxsocksrc)
                              return (ENOSPC);
                      nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
                          M_NOWAIT | M_ZERO);
                      if (nims == NULL)
                              return (ENOMEM);
                      lims = (struct in6_msource *)nims;
                      lims->im6s_addr = find.im6s_addr;
                      lims->im6sl_st[0] = MCAST_UNDEFINED;
                      RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
                      ++imf->im6f_nsrc;
              }
      
              *plims = lims;
      
              return (error);
      }
      
      /*
       * Graft a source entry into an existing socket-layer filter set,
       * maintaining any required invariants and checking allocations.
       *
       * The source is marked as being in the new filter mode at t1.
       *
       * Return the pointer to the new node, otherwise return NULL.
       */
      static struct in6_msource *
      im6f_graft(struct in6_mfilter *imf, const uint8_t st1,
          const struct sockaddr_in6 *psin)
      {
              struct ip6_msource        *nims;
              struct in6_msource        *lims;
      
              nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
                  M_NOWAIT | M_ZERO);
              if (nims == NULL)
                      return (NULL);
              lims = (struct in6_msource *)nims;
              lims->im6s_addr = psin->sin6_addr;
              lims->im6sl_st[0] = MCAST_UNDEFINED;
              lims->im6sl_st[1] = st1;
              RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
              ++imf->im6f_nsrc;
      
              return (lims);
      }
      
      /*
       * Prune a source entry from an existing socket-layer filter set,
       * maintaining any required invariants and checking allocations.
       *
       * The source is marked as being left at t1, it is not freed.
       *
       * Return 0 if no error occurred, otherwise return an errno value.
       */
      static int
      im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin)
      {
              struct ip6_msource         find;
              struct ip6_msource        *ims;
              struct in6_msource        *lims;
      
              find.im6s_addr = psin->sin6_addr;
              ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
              if (ims == NULL)
                      return (ENOENT);
              lims = (struct in6_msource *)ims;
              lims->im6sl_st[1] = MCAST_UNDEFINED;
              return (0);
      }
      
      /*
       * Revert socket-layer filter set deltas at t1 to t0 state.
       */
      static void
      im6f_rollback(struct in6_mfilter *imf)
      {
              struct ip6_msource        *ims, *tims;
              struct in6_msource        *lims;
      
              RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
                      lims = (struct in6_msource *)ims;
                      if (lims->im6sl_st[0] == lims->im6sl_st[1]) {
                              /* no change at t1 */
                              continue;
                      } else if (lims->im6sl_st[0] != MCAST_UNDEFINED) {
                              /* revert change to existing source at t1 */
                              lims->im6sl_st[1] = lims->im6sl_st[0];
                      } else {
                              /* revert source added t1 */
                              CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
                              RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
                              free(ims, M_IN6MFILTER);
                              imf->im6f_nsrc--;
                      }
              }
              imf->im6f_st[1] = imf->im6f_st[0];
      }
      
      /*
       * Mark socket-layer filter set as INCLUDE {} at t1.
       */
      static void
      im6f_leave(struct in6_mfilter *imf)
      {
              struct ip6_msource        *ims;
              struct in6_msource        *lims;
      
              RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
                      lims = (struct in6_msource *)ims;
                      lims->im6sl_st[1] = MCAST_UNDEFINED;
              }
              imf->im6f_st[1] = MCAST_INCLUDE;
      }
      
      /*
       * Mark socket-layer filter set deltas as committed.
       */
      static void
      im6f_commit(struct in6_mfilter *imf)
      {
              struct ip6_msource        *ims;
              struct in6_msource        *lims;
      
	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
                      lims = (struct in6_msource *)ims;
                      lims->im6sl_st[0] = lims->im6sl_st[1];
              }
	imf->im6f_st[0] = imf->im6f_st[1];
      }
      
      /*
       * Reap unreferenced sources from socket-layer filter set.
       */
      static void
      im6f_reap(struct in6_mfilter *imf)
      {
              struct ip6_msource        *ims, *tims;
              struct in6_msource        *lims;
      
	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
                      lims = (struct in6_msource *)ims;
		if ((lims->im6sl_st[0] == MCAST_UNDEFINED) &&
                          (lims->im6sl_st[1] == MCAST_UNDEFINED)) {
                              CTR2(KTR_MLD, "%s: free lims %p", __func__, ims);
			RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
                              free(ims, M_IN6MFILTER);
                              imf->im6f_nsrc--;
                      }
              }
      }
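
/*
 * Editor's sketch of how the helpers above cooperate in a socket
 * option handler (illustrative control flow, not verbatim kernel
 * code): deltas are staged at t1, pushed down to the MLD layer, and
 * then either rolled back or committed.
 *
 *	... stage deltas in imf via im6f_graft()/im6f_prune()/im6f_leave() ...
 *	error = in6m_merge(inm, imf);
 *	if (error == 0)
 *		error = mld_change_state(inm, 0);
 *	if (error)
 *		im6f_rollback(imf);
 *	else
 *		im6f_commit(imf);
 *	im6f_reap(imf);
 */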
      
      /*
       * Purge socket-layer filter set.
       */
      static void
      im6f_purge(struct in6_mfilter *imf)
      {
              struct ip6_msource        *ims, *tims;
      
              RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
                      CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
                      RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
                      free(ims, M_IN6MFILTER);
                      imf->im6f_nsrc--;
              }
              imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED;
              KASSERT(RB_EMPTY(&imf->im6f_sources),
                  ("%s: im6f_sources not empty", __func__));
      }
      
      /*
       * Look up a source filter entry for a multicast group.
       *
       * inm is the group descriptor to work with.
       * addr is the IPv6 address to look up.
       * noalloc may be non-zero to suppress allocation of sources.
       * *pims will be set to the address of the retrieved or allocated source.
       *
       * SMPng: NOTE: may be called with locks held.
       * Return 0 if successful, otherwise return a non-zero error code.
       */
      static int
      in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr,
          const int noalloc, struct ip6_msource **pims)
      {
              struct ip6_msource         find;
              struct ip6_msource        *ims, *nims;
      #ifdef KTR
              char                         ip6tbuf[INET6_ADDRSTRLEN];
      #endif
      
              find.im6s_addr = *addr;
              ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
              if (ims == NULL && !noalloc) {
                      if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
                              return (ENOSPC);
                      nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
                          M_NOWAIT | M_ZERO);
                      if (nims == NULL)
                              return (ENOMEM);
                      nims->im6s_addr = *addr;
                      RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
                      ++inm->in6m_nsrc;
                      ims = nims;
                      CTR3(KTR_MLD, "%s: allocated %s as %p", __func__,
                          ip6_sprintf(ip6tbuf, addr), ims);
              }
      
              *pims = ims;
              return (0);
      }
      
      /*
       * Merge socket-layer source into MLD-layer source.
       * If rollback is non-zero, perform the inverse of the merge.
       */
      static void
      im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims,
          const int rollback)
      {
              int n = rollback ? -1 : 1;
      #ifdef KTR
              char ip6tbuf[INET6_ADDRSTRLEN];
      
              ip6_sprintf(ip6tbuf, &lims->im6s_addr);
      #endif
      
	if (lims->im6sl_st[0] == MCAST_EXCLUDE) {
		CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf);
		ims->im6s_st[1].ex -= n;
	} else if (lims->im6sl_st[0] == MCAST_INCLUDE) {
		CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf);
		ims->im6s_st[1].in -= n;
	}

	if (lims->im6sl_st[1] == MCAST_EXCLUDE) {
		CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf);
		ims->im6s_st[1].ex += n;
	} else if (lims->im6sl_st[1] == MCAST_INCLUDE) {
		CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf);
		ims->im6s_st[1].in += n;
              }
      }
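
/*
 * Editor's worked example for the arithmetic above: a source whose
 * socket-layer state moves from MCAST_INCLUDE at t0 to MCAST_EXCLUDE
 * at t1 adjusts the group-wide counters as
 *
 *	ims->im6s_st[1].in -= 1;	(leaves the include set)
 *	ims->im6s_st[1].ex += 1;	(enters the exclude set)
 *
 * and a rollback call (rollback != 0) applies the same deltas with the
 * sign flipped, restoring the previous counts.
 */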
      
      /*
       * Atomically update the global in6_multi state, when a membership's
       * filter list is being updated in any way.
       *
       * imf is the per-inpcb-membership group filter pointer.
       * A fake imf may be passed for in-kernel consumers.
       *
       * XXX This is a candidate for a set-symmetric-difference style loop
       * which would eliminate the repeated lookup from root of ims nodes,
       * as they share the same key space.
       *
       * If any error occurred this function will back out of refcounts
       * and return a non-zero value.
       */
      static int
      in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
{
              struct ip6_msource        *ims, *nims;
              struct in6_msource        *lims;
              int                         schanged, error;
              int                         nsrc0, nsrc1;
      
              schanged = 0;
              error = 0;
              nsrc1 = nsrc0 = 0;
              IN6_MULTI_LIST_LOCK_ASSERT();
      
              /*
               * Update the source filters first, as this may fail.
               * Maintain count of in-mode filters at t0, t1. These are
               * used to work out if we transition into ASM mode or not.
               * Maintain a count of source filters whose state was
               * actually modified by this operation.
               */
              RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
                      lims = (struct in6_msource *)ims;
		if (lims->im6sl_st[0] == imf->im6f_st[0])
			nsrc0++;
		if (lims->im6sl_st[1] == imf->im6f_st[1])
			nsrc1++;
		if (lims->im6sl_st[0] == lims->im6sl_st[1])
			continue;
                      error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims);
                      ++schanged;
                      if (error)
                              break;
                      im6s_merge(nims, lims, 0);
              }
              if (error) {
                      struct ip6_msource *bims;
      
                      RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) {
                              lims = (struct in6_msource *)ims;
                              if (lims->im6sl_st[0] == lims->im6sl_st[1])
                                      continue;
                              (void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims);
                              if (bims == NULL)
                                      continue;
                              im6s_merge(bims, lims, 1);
                      }
                      goto out_reap;
              }
      
              CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1",
                  __func__, nsrc0, nsrc1);
      
              /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
	if (imf->im6f_st[0] == imf->im6f_st[1] &&
	    imf->im6f_st[1] == MCAST_INCLUDE) {
		if (nsrc1 == 0) {
			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
			--inm->in6m_st[1].iss_in;
		}
                      }
              }
      
              /* Handle filter mode transition on socket. */
	if (imf->im6f_st[0] != imf->im6f_st[1]) {
		CTR3(KTR_MLD, "%s: imf transition %d to %d",
		    __func__, imf->im6f_st[0], imf->im6f_st[1]);

		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
			CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__);
			--inm->in6m_st[1].iss_ex;
		} else if (imf->im6f_st[0] == MCAST_INCLUDE) {
			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
			--inm->in6m_st[1].iss_in;
		}

		if (imf->im6f_st[1] == MCAST_EXCLUDE) {
			CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__);
			inm->in6m_st[1].iss_ex++;
		} else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
			CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__);
			inm->in6m_st[1].iss_in++;
                      }
              }
      
              /*
               * Track inm filter state in terms of listener counts.
               * If there are any exclusive listeners, stack-wide
               * membership is exclusive.
               * Otherwise, if only inclusive listeners, stack-wide is inclusive.
               * If no listeners remain, state is undefined at t1,
               * and the MLD lifecycle for this group should finish.
               */
              if (inm->in6m_st[1].iss_ex > 0) {
                      CTR1(KTR_MLD, "%s: transition to EX", __func__);
		inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE;
              } else if (inm->in6m_st[1].iss_in > 0) {
                      CTR1(KTR_MLD, "%s: transition to IN", __func__);
                      inm->in6m_st[1].iss_fmode = MCAST_INCLUDE;
              } else {
                      CTR1(KTR_MLD, "%s: transition to UNDEF", __func__);
                      inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
              }
      
              /* Decrement ASM listener count on transition out of ASM mode. */
	if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
		if ((imf->im6f_st[1] != MCAST_EXCLUDE) ||
		    (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
			CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__);
			--inm->in6m_st[1].iss_asm;
		}
	}

	/* Increment ASM listener count on transition to ASM mode. */
	if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
		CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__);
		inm->in6m_st[1].iss_asm++;
              }
      
              CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm);
              in6m_print(inm);
      
      out_reap:
	if (schanged > 0) {
		CTR1(KTR_MLD, "%s: sources changed; reaping", __func__);
		in6m_reap(inm);
              }
              return (error);
      }
      
      /*
       * Mark an in6_multi's filter set deltas as committed.
       * Called by MLD after a state change has been enqueued.
       */
      void
      in6m_commit(struct in6_multi *inm)
{
              struct ip6_msource        *ims;
      
              CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm);
              CTR1(KTR_MLD, "%s: pre commit:", __func__);
              in6m_print(inm);
      
	RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
                      ims->im6s_st[0] = ims->im6s_st[1];
              }
              inm->in6m_st[0] = inm->in6m_st[1];
      }
      
      /*
       * Reap unreferenced nodes from an in6_multi's filter set.
       */
      static void
      in6m_reap(struct in6_multi *inm)
      {
              struct ip6_msource        *ims, *tims;
      
	RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
		if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 ||
		    ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 ||
		    ims->im6s_stp != 0)
			continue;
		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
		RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
                      free(ims, M_IP6MSOURCE);
                      inm->in6m_nsrc--;
              }
      }
      
      /*
       * Purge all source nodes from an in6_multi's filter set.
       */
      static void
      in6m_purge(struct in6_multi *inm)
      {
              struct ip6_msource        *ims, *tims;
      
              RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
                      CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
                      RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
                      free(ims, M_IP6MSOURCE);
                      inm->in6m_nsrc--;
              }
              /* Free state-change requests that might be queued. */
              mbufq_drain(&inm->in6m_scq);
      }
      
      /*
       * Join a multicast address w/o sources.
       * KAME compatibility entry point.
       *
       * SMPng: Assume no mc locks held by caller.
       */
      int
      in6_joingroup(struct ifnet *ifp, const struct in6_addr *mcaddr,
          /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
          const int delay)
      {
              int error;
      
              IN6_MULTI_LOCK();
              error = in6_joingroup_locked(ifp, mcaddr, NULL, pinm, delay);
              IN6_MULTI_UNLOCK();
              return (error);
      }
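
/*
 * Illustrative sketch (not compiled): how a kernel consumer might perform
 * an ASM join through this entry point, with no source filter and no MLD
 * report delay.  "ifp" and the group address are assumed to be valid;
 * passing a NULL imf makes the join behave as an exclusive-mode (ASM)
 * membership, as described above.
 *
 *	struct in6_multi *inm = NULL;
 *	struct in6_addr grp;
 *	int error;
 *
 *	... initialize grp with the desired multicast group ...
 *	error = in6_joingroup(ifp, &grp, NULL, &inm, 0);
 *	if (error != 0)
 *		... the group was not joined ...
 *
 * On success *pinm holds a referenced in6_multi, which is typically
 * released later via in6_leavegroup().
 */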
      
      /*
       * Join a multicast group; real entry point.
       *
       * Only preserves atomicity at inm level.
       * NOTE: imf argument cannot be const due to sys/tree.h limitations.
       *
       * If the MLD downcall fails, the group is not joined, and an error
       * code is returned.
       */
      static int
      in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr,
          /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
          const int delay)
{
              struct in6_multi_head    inmh;
              struct in6_mfilter         timf;
              struct in6_multi        *inm;
              struct ifmultiaddr *ifma;
              int                         error;
      #ifdef KTR
              char                         ip6tbuf[INET6_ADDRSTRLEN];
      #endif
      
              /*
               * Sanity: Check scope zone ID was set for ifp, if and
               * only if group is scoped to an interface.
               */
              KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr),
                  ("%s: not a multicast address", __func__));
	if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) ||
	    IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) {
		KASSERT(mcaddr->s6_addr16[1] != 0,
                          ("%s: scope zone ID not set", __func__));
              }
      
              IN6_MULTI_LOCK_ASSERT();
              IN6_MULTI_LIST_UNLOCK_ASSERT();
      
	CTR4(KTR_MLD, "%s: join %s on %p(%s)", __func__,
                  ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp));
      
              error = 0;
              inm = NULL;
      
              /*
               * If no imf was specified (i.e. kernel consumer),
               * fake one up and assume it is an ASM join.
               */
	if (imf == NULL) {
		im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
		imf = &timf;
	}
	error = in6_getmulti(ifp, mcaddr, &inm);
              if (error) {
                      CTR1(KTR_MLD, "%s: in6_getmulti() failure", __func__);
                      return (error);
              }
      
              IN6_MULTI_LIST_LOCK();
              CTR1(KTR_MLD, "%s: merge inm state", __func__);
              error = in6m_merge(inm, imf);
              if (error) {
                      CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
                      goto out_in6m_release;
              }
      
              CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
              error = mld_change_state(inm, delay);
              if (error) {
                      CTR1(KTR_MLD, "%s: failed to update source", __func__);
                      goto out_in6m_release;
              }
      
      out_in6m_release:
              SLIST_INIT(&inmh);
              if (error) {
                      struct epoch_tracker et;
      
                      CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
                      IF_ADDR_WLOCK(ifp);
                      NET_EPOCH_ENTER(et);
                      CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
                              if (ifma->ifma_protospec == inm) {
                                      ifma->ifma_protospec = NULL;
                                      break;
                              }
                      }
                      in6m_disconnect_locked(&inmh, inm);
                      in6m_rele_locked(&inmh, inm);
                      NET_EPOCH_EXIT(et);
                      IF_ADDR_WUNLOCK(ifp);
              } else {
                      *pinm = inm;
              }
              IN6_MULTI_LIST_UNLOCK();
	in6m_release_list_deferred(&inmh);
              return (error);
      }
      
      /*
       * Leave a multicast group; unlocked entry point.
       */
      int
      in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
      {
              int error;
      
              IN6_MULTI_LOCK();
              error = in6_leavegroup_locked(inm, imf);
              IN6_MULTI_UNLOCK();
              return (error);
      }
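
/*
 * Illustrative sketch (not compiled): the counterpart to the join example
 * above.  A kernel consumer holding the in6_multi pointer obtained from
 * in6_joingroup() leaves with a NULL imf, which is faked up as an ASM
 * leave by in6_leavegroup_locked():
 *
 *	(void)in6_leavegroup(inm, NULL);
 *
 * The membership's source filters are expunged and an MLD state-change
 * downcall is made on its behalf.
 */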
      
      /*
       * Leave a multicast group; real entry point.
       * All source filters will be expunged.
       *
       * Only preserves atomicity at inm level.
       *
       * Holding the write lock for the INP which contains imf
       * is highly advisable. We can't assert for it as imf does not
       * contain a back-pointer to the owning inp.
       *
       * Note: This is not the same as in6m_release(*) as this function also
       * makes a state change downcall into MLD.
       */
      int
      in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
{
              struct in6_multi_head         inmh;
              struct in6_mfilter         timf;
              struct ifnet *ifp;
              int                         error;
      #ifdef KTR
              char                         ip6tbuf[INET6_ADDRSTRLEN];
      #endif
      
              error = 0;
      
              IN6_MULTI_LOCK_ASSERT();
      
              CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__,
                  inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
                  (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)),
                  imf);
      
              /*
               * If no imf was specified (i.e. kernel consumer),
	 * fake one up and assume it is an ASM leave.
	 */
	if (imf == NULL) {
                      im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
                      imf = &timf;
              }
      
              /*
               * Begin state merge transaction at MLD layer.
               *
               * As this particular invocation should not cause any memory
               * to be allocated, and there is no opportunity to roll back
               * the transaction, it MUST NOT fail.
               */
      
              ifp = inm->in6m_ifp;
              IN6_MULTI_LIST_LOCK();
              CTR1(KTR_MLD, "%s: merge inm state", __func__);
              error = in6m_merge(inm, imf);
              KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
      
              CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
              error = 0;
              if (ifp)
		error = mld_change_state(inm, 0);
              if (error)
                      CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
      
              CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
              if (ifp)
                      IF_ADDR_WLOCK(ifp);
      
              SLIST_INIT(&inmh);
	if (inm->in6m_refcount == 1)
                      in6m_disconnect_locked(&inmh, inm);
              in6m_rele_locked(&inmh, inm);
              if (ifp)
		IF_ADDR_WUNLOCK(ifp);
	IN6_MULTI_LIST_UNLOCK();
	in6m_release_list_deferred(&inmh);
              return (error);
      }
      
      
      /*
       * Block or unblock an ASM multicast source on an inpcb.
       * This implements the delta-based API described in RFC 3678.
       *
       * The delta-based API applies only to exclusive-mode memberships.
       * An MLD downcall will be performed.
       *
       * SMPng: NOTE: Must take Giant as a join may create a new ifma.
       *
       * Return 0 if successful, otherwise return an appropriate error code.
       */
      static int
      in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
      {
              struct group_source_req                 gsr;
              sockunion_t                        *gsa, *ssa;
              struct ifnet                        *ifp;
              struct in6_mfilter                *imf;
              struct ip6_moptions                *imo;
              struct in6_msource                *ims;
              struct in6_multi                        *inm;
              uint16_t                         fmode;
              int                                 error, doblock;
      #ifdef KTR
              char                                 ip6tbuf[INET6_ADDRSTRLEN];
      #endif
      
              ifp = NULL;
              error = 0;
              doblock = 0;
      
              memset(&gsr, 0, sizeof(struct group_source_req));
              gsa = (sockunion_t *)&gsr.gsr_group;
              ssa = (sockunion_t *)&gsr.gsr_source;
      
              switch (sopt->sopt_name) {
              case MCAST_BLOCK_SOURCE:
              case MCAST_UNBLOCK_SOURCE:
                      error = sooptcopyin(sopt, &gsr,
                          sizeof(struct group_source_req),
                          sizeof(struct group_source_req));
		if (error)
			return (error);

		if (gsa->sin6.sin6_family != AF_INET6 ||
		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
			return (EINVAL);

		if (ssa->sin6.sin6_family != AF_INET6 ||
		    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
			return (EINVAL);

		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
                              return (EADDRNOTAVAIL);
      
                      ifp = ifnet_byindex(gsr.gsr_interface);
      
                      if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
                              doblock = 1;
                      break;
      
              default:
                      CTR2(KTR_MLD, "%s: unknown sopt_name %d",
                          __func__, sopt->sopt_name);
                      return (EOPNOTSUPP);
                      break;
              }
      
	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
                      return (EINVAL);
      
              (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
      
              /*
               * Check if we are actually a member of this group.
               */
              imo = in6p_findmoptions(inp);
              imf = im6o_match_group(imo, ifp, &gsa->sa);
              if (imf == NULL) {
                      error = EADDRNOTAVAIL;
                      goto out_in6p_locked;
              }
              inm = imf->im6f_in6m;
      
              /*
	 * Attempting to use the delta-based API on a
	 * non-exclusive-mode membership is an error.
	 */
	fmode = imf->im6f_st[0];
	if (fmode != MCAST_EXCLUDE) {
                      error = EINVAL;
                      goto out_in6p_locked;
              }
      
              /*
               * Deal with error cases up-front:
               *  Asked to block, but already blocked; or
               *  Asked to unblock, but nothing to unblock.
               * If adding a new block entry, allocate it.
               */
              ims = im6o_match_source(imf, &ssa->sa);
	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
                      CTR3(KTR_MLD, "%s: source %s %spresent", __func__,
                          ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
                          doblock ? "" : "not ");
                      error = EADDRNOTAVAIL;
                      goto out_in6p_locked;
              }
      
              INP_WLOCK_ASSERT(inp);
      
              /*
               * Begin state merge transaction at socket layer.
               */
              if (doblock) {
                      CTR2(KTR_MLD, "%s: %s source", __func__, "block");
                      ims = im6f_graft(imf, fmode, &ssa->sin6);
                      if (ims == NULL)
                              error = ENOMEM;
              } else {
                      CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
                      error = im6f_prune(imf, &ssa->sin6);
              }
      
              if (error) {
                      CTR1(KTR_MLD, "%s: merge imf state failed", __func__);
                      goto out_im6f_rollback;
              }
      
              /*
               * Begin state merge transaction at MLD layer.
               */
              IN6_MULTI_LIST_LOCK();
              CTR1(KTR_MLD, "%s: merge inm state", __func__);
              error = in6m_merge(inm, imf);
              if (error)
                      CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
              else {
                      CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
                      error = mld_change_state(inm, 0);
                      if (error)
                              CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
              }
      
              IN6_MULTI_LIST_UNLOCK();
      
      out_im6f_rollback:
              if (error)
                      im6f_rollback(imf);
              else
                      im6f_commit(imf);
      
	im6f_reap(imf);
      
      out_in6p_locked:
              INP_WUNLOCK(inp);
              return (error);
      }
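
/*
 * Illustrative userland sketch (not part of the kernel build): blocking a
 * source on an existing exclusive-mode membership using the RFC 3678
 * delta-based API handled above.  Error handling is omitted; "s",
 * "ifindex", "group_addr" and "source_addr" are placeholders.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *
 *	struct group_source_req gsr;
 *	struct sockaddr_in6 *grp = (struct sockaddr_in6 *)&gsr.gsr_group;
 *	struct sockaddr_in6 *src = (struct sockaddr_in6 *)&gsr.gsr_source;
 *
 *	memset(&gsr, 0, sizeof(gsr));
 *	gsr.gsr_interface = ifindex;
 *	grp->sin6_family = src->sin6_family = AF_INET6;
 *	grp->sin6_len = src->sin6_len = sizeof(struct sockaddr_in6);
 *	grp->sin6_addr = group_addr;
 *	src->sin6_addr = source_addr;
 *	(void)setsockopt(s, IPPROTO_IPV6, MCAST_BLOCK_SOURCE,
 *	    &gsr, sizeof(gsr));
 *
 * MCAST_UNBLOCK_SOURCE takes the same structure and removes the block.
 */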
      
      /*
       * Given an inpcb, return its multicast options structure pointer.  Accepts
       * an unlocked inpcb pointer, but will return it locked.  May sleep.
       *
       * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
       * SMPng: NOTE: Returns with the INP write lock held.
       */
      static struct ip6_moptions *
      in6p_findmoptions(struct inpcb *inp)
{
	struct ip6_moptions	 *imo;

	INP_WLOCK(inp);
	if (inp->in6p_moptions != NULL)
		return (inp->in6p_moptions);

	INP_WUNLOCK(inp);

	imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK);
      
              imo->im6o_multicast_ifp = NULL;
              imo->im6o_multicast_hlim = V_ip6_defmcasthlim;
              imo->im6o_multicast_loop = in6_mcast_loop;
              STAILQ_INIT(&imo->im6o_head);
      
              INP_WLOCK(inp);
              if (inp->in6p_moptions != NULL) {
                      free(imo, M_IP6MOPTS);
                      return (inp->in6p_moptions);
              }
              inp->in6p_moptions = imo;
              return (imo);
      }
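
/*
 * Illustrative sketch (not compiled): the caller pattern this helper is
 * designed for, as used by the setsockopt handlers in this file.  The
 * inpcb is passed in unlocked; the returned options structure is used
 * while the INP write lock (acquired by the call) is held, and the caller
 * drops that lock when done.
 *
 *	struct ip6_moptions *imo;
 *
 *	imo = in6p_findmoptions(inp);
 *	... modify imo under the INP write lock ...
 *	INP_WUNLOCK(inp);
 */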
      
      /*
       * Discard the IPv6 multicast options (and source filters).
       *
       * SMPng: NOTE: assumes INP write lock is held.
       *
       * XXX can all be safely deferred to epoch_call
       *
       */
      
      static void
      inp_gcmoptions(struct ip6_moptions *imo)
      {
              struct in6_mfilter *imf;
              struct in6_multi *inm;
              struct ifnet *ifp;
      
              while ((imf = ip6_mfilter_first(&imo->im6o_head)) != NULL) {
                      ip6_mfilter_remove(&imo->im6o_head, imf);
      
                      im6f_leave(imf);
                      if ((inm = imf->im6f_in6m) != NULL) {
                              if ((ifp = inm->in6m_ifp) != NULL) {
                                      CURVNET_SET(ifp->if_vnet);
                                      (void)in6_leavegroup(inm, imf);
                                      CURVNET_RESTORE();
                              } else {
                                      (void)in6_leavegroup(inm, imf);
                              }
                      }
                      ip6_mfilter_free(imf);
              }
              free(imo, M_IP6MOPTS);
      }
      
      void
      ip6_freemoptions(struct ip6_moptions *imo)
      {
              if (imo == NULL)
                      return;
              inp_gcmoptions(imo);
      }
      
      /*
       * Atomically get source filters on a socket for an IPv6 multicast group.
       * Called with INP lock held; returns with lock released.
       */
      static int
      in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
      {
              struct __msfilterreq         msfr;
              sockunion_t                *gsa;
              struct ifnet                *ifp;
              struct ip6_moptions        *imo;
              struct in6_mfilter        *imf;
              struct ip6_msource        *ims;
              struct in6_msource        *lims;
              struct sockaddr_in6        *psin;
              struct sockaddr_storage        *ptss;
              struct sockaddr_storage        *tss;
              int                         error;
              size_t                         nsrcs, ncsrcs;
      
              INP_WLOCK_ASSERT(inp);
      
              imo = inp->in6p_moptions;
              KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__));
      
              INP_WUNLOCK(inp);
      
              error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
                  sizeof(struct __msfilterreq));
              if (error)
                      return (error);
      
	if (msfr.msfr_group.ss_family != AF_INET6 ||
                  msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
                      return (EINVAL);
      
              gsa = (sockunion_t *)&msfr.msfr_group;
              if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
                      return (EINVAL);
      
              if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
                      return (EADDRNOTAVAIL);
              ifp = ifnet_byindex(msfr.msfr_ifindex);
              if (ifp == NULL)
                      return (EADDRNOTAVAIL);
              (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
      
              INP_WLOCK(inp);
      
              /*
               * Lookup group on the socket.
               */
              imf = im6o_match_group(imo, ifp, &gsa->sa);
              if (imf == NULL) {
                      INP_WUNLOCK(inp);
                      return (EADDRNOTAVAIL);
              }
      
              /*
               * Ignore memberships which are in limbo.
               */
              if (imf->im6f_st[1] == MCAST_UNDEFINED) {
                      INP_WUNLOCK(inp);
                      return (EAGAIN);
              }
              msfr.msfr_fmode = imf->im6f_st[1];
      
              /*
               * If the user specified a buffer, copy out the source filter
               * entries to userland gracefully.
               * We only copy out the number of entries which userland
               * has asked for, but we always tell userland how big the
               * buffer really needs to be.
               */
              if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
                      msfr.msfr_nsrcs = in6_mcast_maxsocksrc;
              tss = NULL;
              if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
                      tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
                          M_TEMP, M_NOWAIT | M_ZERO);
                      if (tss == NULL) {
                              INP_WUNLOCK(inp);
                              return (ENOBUFS);
                      }
              }
      
              /*
               * Count number of sources in-mode at t0.
               * If buffer space exists and remains, copy out source entries.
               */
              nsrcs = msfr.msfr_nsrcs;
              ncsrcs = 0;
              ptss = tss;
              RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
                      lims = (struct in6_msource *)ims;
                      if (lims->im6sl_st[0] == MCAST_UNDEFINED ||
                          lims->im6sl_st[0] != imf->im6f_st[0])
                              continue;
                      ++ncsrcs;
                      if (tss != NULL && nsrcs > 0) {
                              psin = (struct sockaddr_in6 *)ptss;
                              psin->sin6_family = AF_INET6;
                              psin->sin6_len = sizeof(struct sockaddr_in6);
                              psin->sin6_addr = lims->im6s_addr;
                              psin->sin6_port = 0;
                              --nsrcs;
                              ++ptss;
                      }
              }
      
              INP_WUNLOCK(inp);
      
              if (tss != NULL) {
                      error = copyout(tss, msfr.msfr_srcs,
                          sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
                      free(tss, M_TEMP);
                      if (error)
                              return (error);
              }
      
              msfr.msfr_nsrcs = ncsrcs;
              error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
      
              return (error);
      }
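
/*
 * Illustrative userland sketch (not part of the kernel build): retrieving
 * a group's filter via the getsourcefilter(3) wrapper, which issues the
 * IPV6_MSFILTER request handled above.  As noted in the comment above, a
 * first call with numsrc = 0 learns how large a source buffer is needed.
 * "s", "ifindex" and "grp" are placeholders.
 *
 *	uint32_t fmode, numsrc = 0;
 *
 *	if (getsourcefilter(s, ifindex, (struct sockaddr *)&grp,
 *	    sizeof(grp), &fmode, &numsrc, NULL) == 0) {
 *		... allocate numsrc * sizeof(struct sockaddr_storage),
 *		    then call again to fetch the source list ...
 *	}
 */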
      
      /*
       * Return the IP multicast options in response to user getsockopt().
       */
      int
      ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt)
{
              struct ip6_moptions        *im6o;
              int                         error;
              u_int                         optval;
      
              INP_WLOCK(inp);
              im6o = inp->in6p_moptions;
              /*
	 * If socket is neither of type SOCK_RAW nor SOCK_DGRAM,
	 * or is a divert socket, reject it.
	 */
	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
                  inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
                      INP_WUNLOCK(inp);
                      return (EOPNOTSUPP);
              }
      
              error = 0;
              switch (sopt->sopt_name) {
              case IPV6_MULTICAST_IF:
		if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) {
			optval = 0;
		} else {
			optval = im6o->im6o_multicast_ifp->if_index;
                      }
                      INP_WUNLOCK(inp);
                      error = sooptcopyout(sopt, &optval, sizeof(u_int));
                      break;
      
              case IPV6_MULTICAST_HOPS:
                      if (im6o == NULL)
			optval = V_ip6_defmcasthlim;
		else
			optval = im6o->im6o_multicast_hlim;
                      INP_WUNLOCK(inp);
                      error = sooptcopyout(sopt, &optval, sizeof(u_int));
                      break;
      
              case IPV6_MULTICAST_LOOP:
                      if (im6o == NULL)
			optval = in6_mcast_loop; /* XXX VIMAGE */
		else
			optval = im6o->im6o_multicast_loop;
                      INP_WUNLOCK(inp);
                      error = sooptcopyout(sopt, &optval, sizeof(u_int));
                      break;
      
              case IPV6_MSFILTER:
                      if (im6o == NULL) {
                              error = EADDRNOTAVAIL;
                              INP_WUNLOCK(inp);
                      } else {
                              error = in6p_get_source_filters(inp, sopt);
                      }
                      break;
      
              default:
                      INP_WUNLOCK(inp);
                      error = ENOPROTOOPT;
                      break;
              }
      
              INP_UNLOCK_ASSERT(inp);
      
              return (error);
      }
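
/*
 * Illustrative userland sketch (not part of the kernel build): reading one
 * of the scalar options serviced above.  The kernel copies out a u_int,
 * falling back to the system default hop limit when the socket has no
 * multicast options structure.
 *
 *	u_int hlim;
 *	socklen_t len = sizeof(hlim);
 *
 *	(void)getsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hlim, &len);
 */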
      
      /*
       * Look up the ifnet to use for a multicast group membership,
       * given the address of an IPv6 group.
       *
       * This routine exists to support legacy IPv6 multicast applications.
       *
       * If inp is non-NULL, use this socket's current FIB number for any
       * required FIB lookup. Look up the group address in the unicast FIB,
       * and use its ifp; usually, this points to the default next-hop.
       * If the FIB lookup fails, return NULL.
       *
       * FUTURE: Support multiple forwarding tables for IPv6.
       *
       * Returns NULL if no ifp could be found.
       */
      static struct ifnet *
      in6p_lookup_mcast_ifp(const struct inpcb *inp,
          const struct sockaddr_in6 *gsin6)
      {
              struct nhop6_basic        nh6;
              struct in6_addr                dst;
              uint32_t                scopeid;
              uint32_t                fibnum;
      
              KASSERT(inp->inp_vflag & INP_IPV6,
                  ("%s: not INP_IPV6 inpcb", __func__));
              KASSERT(gsin6->sin6_family == AF_INET6,
                  ("%s: not AF_INET6 group", __func__));
      
	in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid);
              fibnum = inp ? inp->inp_inc.inc_fibnum : RT_DEFAULT_FIB;
              if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0)
                      return (NULL);
      
              return (nh6.nh_ifp);
      }
      
      /*
       * Join an IPv6 multicast group, possibly with a source.
       *
       * FIXME: The KAME use of the unspecified address (::)
       * to join *all* multicast groups is currently unsupported.
       */
      static int
      in6p_join_group(struct inpcb *inp, struct sockopt *sopt)
      {
              struct in6_multi_head                 inmh;
              struct group_source_req                 gsr;
              sockunion_t                        *gsa, *ssa;
              struct ifnet                        *ifp;
              struct in6_mfilter                *imf;
              struct ip6_moptions                *imo;
              struct in6_multi                *inm;
              struct in6_msource                *lims;
              int                                 error, is_new;
      
              SLIST_INIT(&inmh);
              ifp = NULL;
              lims = NULL;
              error = 0;
      
              memset(&gsr, 0, sizeof(struct group_source_req));
              gsa = (sockunion_t *)&gsr.gsr_group;
              gsa->ss.ss_family = AF_UNSPEC;
              ssa = (sockunion_t *)&gsr.gsr_source;
              ssa->ss.ss_family = AF_UNSPEC;
      
              /*
               * Chew everything into struct group_source_req.
               * Overwrite the port field if present, as the sockaddr
               * being copied in may be matched with a binary comparison.
               * Ignore passed-in scope ID.
               */
              switch (sopt->sopt_name) {
              case IPV6_JOIN_GROUP: {
                      struct ipv6_mreq mreq;
      
                      error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
                          sizeof(struct ipv6_mreq));
		if (error)
                              return (error);
      
                      gsa->sin6.sin6_family = AF_INET6;
                      gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
                      gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
      
                      if (mreq.ipv6mr_interface == 0) {
                              ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
                      } else {
			if (V_if_index < mreq.ipv6mr_interface)
				return (EADDRNOTAVAIL);
			ifp = ifnet_byindex(mreq.ipv6mr_interface);
                      }
                      CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p",
                          __func__, mreq.ipv6mr_interface, ifp);
              } break;
      
              case MCAST_JOIN_GROUP:
              case MCAST_JOIN_SOURCE_GROUP:
                      if (sopt->sopt_name == MCAST_JOIN_GROUP) {
			error = sooptcopyin(sopt, &gsr,
			    sizeof(struct group_req),
			    sizeof(struct group_req));
		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
			error = sooptcopyin(sopt, &gsr,
			    sizeof(struct group_source_req),
			    sizeof(struct group_source_req));
		}
		if (error)
			return (error);

		if (gsa->sin6.sin6_family != AF_INET6 ||
                          gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
                              return (EINVAL);
      
		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
			if (ssa->sin6.sin6_family != AF_INET6 ||
			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
				return (EINVAL);
			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
                                      return (EINVAL);
                              /*
                               * TODO: Validate embedded scope ID in source
                               * list entry against passed-in ifp, if and only
                               * if source list filter entry is iface or node local.
                               */
                              in6_clearscope(&ssa->sin6.sin6_addr);
                              ssa->sin6.sin6_port = 0;
                              ssa->sin6.sin6_scope_id = 0;
                      }
      
		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
			return (EADDRNOTAVAIL);
		ifp = ifnet_byindex(gsr.gsr_interface);
                      break;
      
              default:
                      CTR2(KTR_MLD, "%s: unknown sopt_name %d",
                          __func__, sopt->sopt_name);
                      return (EOPNOTSUPP);
                      break;
              }
      
	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
		return (EINVAL);

	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
                      return (EADDRNOTAVAIL);
      
              gsa->sin6.sin6_port = 0;
              gsa->sin6.sin6_scope_id = 0;
      
              /*
               * Always set the scope zone ID on memberships created from userland.
               * Use the passed-in ifp to do this.
               * XXX The in6_setscope() return value is meaningless.
               * XXX SCOPE6_LOCK() is taken by in6_setscope().
               */
              (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
      
              IN6_MULTI_LOCK();
      
              /*
               * Find the membership in the membership list.
               */
              imo = in6p_findmoptions(inp);
              imf = im6o_match_group(imo, ifp, &gsa->sa);
              if (imf == NULL) {
                      is_new = 1;
                      inm = NULL;
      
		if (ip6_mfilter_count(&imo->im6o_head) >= IPV6_MAX_MEMBERSHIPS) {
                              error = ENOMEM;
                              goto out_in6p_locked;
                      }
              } else {
                      is_new = 0;
                      inm = imf->im6f_in6m;
      
		if (ssa->ss.ss_family != AF_UNSPEC) {
			/*
			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
			 * is an error. On an existing inclusive membership,
			 * it just adds the source to the filter list.
			 */
			if (imf->im6f_st[1] != MCAST_INCLUDE) {
                                      error = EINVAL;
                                      goto out_in6p_locked;
                              }
                              /*
                               * Throw out duplicates.
                               *
                               * XXX FIXME: This makes a naive assumption that
                               * even if entries exist for *ssa in this imf,
                               * they will be rejected as dupes, even if they
                               * are not valid in the current mode (in-mode).
                               *
                               * in6_msource is transactioned just as for anything
                               * else in SSM -- but note naive use of in6m_graft()
                               * below for allocating new filter entries.
                               *
                               * This is only an issue if someone mixes the
                               * full-state SSM API with the delta-based API,
                               * which is discouraged in the relevant RFCs.
                               */
                              lims = im6o_match_source(imf, &ssa->sa);
                              if (lims != NULL /*&&
                                  lims->im6sl_st[1] == MCAST_INCLUDE*/) {
                                      error = EADDRNOTAVAIL;
                                      goto out_in6p_locked;
                              }
                      } else {
                              /*
                               * MCAST_JOIN_GROUP alone, on any existing membership,
                               * is rejected, to stop the same inpcb tying up
                               * multiple refs to the in_multi.
                               * On an existing inclusive membership, this is also
                               * an error; if you want to change filter mode,
                               * you must use the userland API setsourcefilter().
                               * XXX We don't reject this for imf in UNDEFINED
                               * state at t1, because allocation of a filter
                               * is atomic with allocation of a membership.
                               */
                              error = EINVAL;
                              goto out_in6p_locked;
                      }
              }
      
              /*
               * Begin state merge transaction at socket layer.
               */
              INP_WLOCK_ASSERT(inp);
      
              /*
               * Graft new source into filter list for this inpcb's
               * membership of the group. The in6_multi may not have
               * been allocated yet if this is a new membership, however,
               * the in_mfilter slot will be allocated and must be initialized.
               *
               * Note: Grafting of exclusive mode filters doesn't happen
               * in this path.
               * XXX: Should check for non-NULL lims (node exists but may
               * not be in-mode) for interop with full-state API.
               */
              if (ssa->ss.ss_family != AF_UNSPEC) {
                      /* Membership starts in IN mode */
		if (is_new) {
                              CTR1(KTR_MLD, "%s: new join w/source", __func__);
                              imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE);
                              if (imf == NULL) {
                                      error = ENOMEM;
                                      goto out_in6p_locked;
                              }
                      } else {
                              CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
                      }
                      lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6);
                      if (lims == NULL) {
                              CTR1(KTR_MLD, "%s: merge imf state failed",
                                  __func__);
                              error = ENOMEM;
                              goto out_in6p_locked;
                      }
              } else {
                      /* No address specified; Membership starts in EX mode */
                      if (is_new) {
                              CTR1(KTR_MLD, "%s: new join w/o source", __func__);
                              imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE);
                              if (imf == NULL) {
                                      error = ENOMEM;
                                      goto out_in6p_locked;
                              }
                      }
              }
      
              /*
               * Begin state merge transaction at MLD layer.
               */
	if (is_new) {
                      in_pcbref(inp);
                      INP_WUNLOCK(inp);
      
                      error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf,
                          &imf->im6f_in6m, 0);
      
                      INP_WLOCK(inp);
                      if (in_pcbrele_wlocked(inp)) {
                              error = ENXIO;
                              goto out_in6p_unlocked;
                      }
                      if (error) {
                              goto out_in6p_locked;
                      }
                      /*
                       * NOTE: Refcount from in6_joingroup_locked()
                       * is protecting membership.
                       */
		ip6_mfilter_insert(&imo->im6o_head, imf);
              } else {
                      CTR1(KTR_MLD, "%s: merge inm state", __func__);
                      IN6_MULTI_LIST_LOCK();
                      error = in6m_merge(inm, imf);
                      if (error) {
                              CTR1(KTR_MLD, "%s: failed to merge inm state",
                                  __func__);
                              IN6_MULTI_LIST_UNLOCK();
                              im6f_rollback(imf);
                              im6f_reap(imf);
                              goto out_in6p_locked;
                      }
                      CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
                      error = mld_change_state(inm, 0);
                      IN6_MULTI_LIST_UNLOCK();
      
		if (error) {
                              CTR1(KTR_MLD, "%s: failed mld downcall",
                                   __func__);
                              im6f_rollback(imf);
                              im6f_reap(imf);
                              goto out_in6p_locked;
                      }
              }
      
              im6f_commit(imf);
              imf = NULL;
      
      out_in6p_locked:
              INP_WUNLOCK(inp);
      out_in6p_unlocked:
              IN6_MULTI_UNLOCK();
      
	if (is_new && imf) {
                      if (imf->im6f_in6m != NULL) {
                              struct in6_multi_head inmh;
      
                              SLIST_INIT(&inmh);
                              SLIST_INSERT_HEAD(&inmh, imf->im6f_in6m, in6m_defer);
                              in6m_release_list_deferred(&inmh);
                      }
                      ip6_mfilter_free(imf);
              }
              return (error);
      }
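
/*
 * Illustrative userland sketch (not part of the kernel build): an ASM join
 * using the protocol-independent group_req form handled above.  "s",
 * "ifindex" and "group_addr" are placeholders.  MCAST_JOIN_SOURCE_GROUP
 * instead takes a struct group_source_req and starts the membership in
 * include mode with the given source.
 *
 *	struct group_req gr;
 *	struct sockaddr_in6 *grp = (struct sockaddr_in6 *)&gr.gr_group;
 *
 *	memset(&gr, 0, sizeof(gr));
 *	gr.gr_interface = ifindex;
 *	grp->sin6_family = AF_INET6;
 *	grp->sin6_len = sizeof(struct sockaddr_in6);
 *	grp->sin6_addr = group_addr;
 *	(void)setsockopt(s, IPPROTO_IPV6, MCAST_JOIN_GROUP, &gr, sizeof(gr));
 */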
      
      /*
       * Leave an IPv6 multicast group on an inpcb, possibly with a source.
       */
      static int
      in6p_leave_group(struct inpcb *inp, struct sockopt *sopt)
      {
              struct ipv6_mreq                 mreq;
              struct group_source_req                 gsr;
              sockunion_t                        *gsa, *ssa;
              struct ifnet                        *ifp;
              struct in6_mfilter                *imf;
              struct ip6_moptions                *imo;
              struct in6_msource                *ims;
              struct in6_multi                *inm;
              uint32_t                         ifindex;
              int                                 error;
              bool                                 is_final;
      #ifdef KTR
              char                                 ip6tbuf[INET6_ADDRSTRLEN];
      #endif
      
              ifp = NULL;
              ifindex = 0;
              error = 0;
              is_final = true;
      
              memset(&gsr, 0, sizeof(struct group_source_req));
              gsa = (sockunion_t *)&gsr.gsr_group;
              gsa->ss.ss_family = AF_UNSPEC;
              ssa = (sockunion_t *)&gsr.gsr_source;
              ssa->ss.ss_family = AF_UNSPEC;
      
              /*
               * Chew everything passed in up into a struct group_source_req
               * as that is easier to process.
               * Note: Any embedded scope ID in the multicast group passed
               * in by userland is ignored, the interface index is the recommended
               * mechanism to specify an interface; see below.
               */
              switch (sopt->sopt_name) {
              case IPV6_LEAVE_GROUP:
                      error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
                          sizeof(struct ipv6_mreq));
		if (error)
			return (error);
		gsa->sin6.sin6_family = AF_INET6;
                      gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
                      gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
                      gsa->sin6.sin6_port = 0;
                      gsa->sin6.sin6_scope_id = 0;
                      ifindex = mreq.ipv6mr_interface;
                      break;
      
              case MCAST_LEAVE_GROUP:
              case MCAST_LEAVE_SOURCE_GROUP:
                      if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
			error = sooptcopyin(sopt, &gsr,
			    sizeof(struct group_req),
			    sizeof(struct group_req));
		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
			error = sooptcopyin(sopt, &gsr,
			    sizeof(struct group_source_req),
			    sizeof(struct group_source_req));
		}
		if (error)
			return (error);

		if (gsa->sin6.sin6_family != AF_INET6 ||
                          gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
                              return (EINVAL);
		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
			if (ssa->sin6.sin6_family != AF_INET6 ||
			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
				return (EINVAL);
			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
                                      return (EINVAL);
                              /*
                               * TODO: Validate embedded scope ID in source
                               * list entry against passed-in ifp, if and only
                               * if source list filter entry is iface or node local.
                               */
                              in6_clearscope(&ssa->sin6.sin6_addr);
                      }
                      gsa->sin6.sin6_port = 0;
                      gsa->sin6.sin6_scope_id = 0;
                      ifindex = gsr.gsr_interface;
                      break;
      
              default:
                      CTR2(KTR_MLD, "%s: unknown sopt_name %d",
                          __func__, sopt->sopt_name);
                      return (EOPNOTSUPP);
                      break;
              }
      
	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
                      return (EINVAL);
      
              /*
               * Validate interface index if provided. If no interface index
               * was provided separately, attempt to look the membership up
               * from the default scope as a last resort to disambiguate
               * the membership we are being asked to leave.
               * XXX SCOPE6 lock potentially taken here.
               */
              if (ifindex != 0) {
		if (V_if_index < ifindex)
			return (EADDRNOTAVAIL);
		ifp = ifnet_byindex(ifindex);
		if (ifp == NULL)
			return (EADDRNOTAVAIL);
		(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
              } else {
                      error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone);
                      if (error)
                              return (EADDRNOTAVAIL);
                      /*
                       * Some badly behaved applications don't pass an ifindex
                       * or a scope ID, which is an API violation. In this case,
                       * perform a lookup as per a v6 join.
                       *
                       * XXX For now, stomp on zone ID for the corner case.
                       * This is not the 'KAME way', but we need to see the ifp
                       * directly until such time as this implementation is
                       * refactored, assuming the scope IDs are the way to go.
                       */
		ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]);
                      if (ifindex == 0) {
                              CTR2(KTR_MLD, "%s: warning: no ifindex, looking up "
                                  "ifp for group %s.", __func__,
                                  ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr));
                              ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
                      } else {
                              ifp = ifnet_byindex(ifindex);
                      }
		if (ifp == NULL)
                              return (EADDRNOTAVAIL);
              }
      
              CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp);
              KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__));
      
              IN6_MULTI_LOCK();
      
              /*
               * Find the membership in the membership list.
               */
              imo = in6p_findmoptions(inp);
              imf = im6o_match_group(imo, ifp, &gsa->sa);
              if (imf == NULL) {
                      error = EADDRNOTAVAIL;
                      goto out_in6p_locked;
              }
              inm = imf->im6f_in6m;
      
              if (ssa->ss.ss_family != AF_UNSPEC)
                      is_final = false;
      
              /*
               * Begin state merge transaction at socket layer.
               */
              INP_WLOCK_ASSERT(inp);
      
              /*
               * If we were instructed only to leave a given source, do so.
               * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
               */
              if (is_final) {
                      ip6_mfilter_remove(&imo->im6o_head, imf);
                      im6f_leave(imf);
      
                      /*
                       * Give up the multicast address record to which
                       * the membership points.
                       */
                      (void)in6_leavegroup_locked(inm, imf);
              } else {
		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
                              error = EADDRNOTAVAIL;
                              goto out_in6p_locked;
                      }
                      ims = im6o_match_source(imf, &ssa->sa);
                      if (ims == NULL) {
                              CTR3(KTR_MLD, "%s: source %p %spresent", __func__,
                                  ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
                                  "not ");
                              error = EADDRNOTAVAIL;
                              goto out_in6p_locked;
                      }
                      CTR2(KTR_MLD, "%s: %s source", __func__, "block");
                      error = im6f_prune(imf, &ssa->sin6);
                      if (error) {
                              CTR1(KTR_MLD, "%s: merge imf state failed",
                                  __func__);
                              goto out_in6p_locked;
                      }
              }
      
              /*
               * Begin state merge transaction at MLD layer.
               */
    5         if (!is_final) {
                      CTR1(KTR_MLD, "%s: merge inm state", __func__);
                      IN6_MULTI_LIST_LOCK();
                      error = in6m_merge(inm, imf);
                      if (error) {
                              CTR1(KTR_MLD, "%s: failed to merge inm state",
                                  __func__);
                              IN6_MULTI_LIST_UNLOCK();
                              im6f_rollback(imf);
                              im6f_reap(imf);
                              goto out_in6p_locked;
                      }
      
                      CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
                      error = mld_change_state(inm, 0);
                      IN6_MULTI_LIST_UNLOCK();
    4                 if (error) {
                              CTR1(KTR_MLD, "%s: failed mld downcall",
                                   __func__);
                              im6f_rollback(imf);
                              im6f_reap(imf);
                              goto out_in6p_locked;
                      }
              }
      
    2         im6f_commit(imf);
    6         im6f_reap(imf);
      
      out_in6p_locked:
              INP_WUNLOCK(inp);
      
   20         if (is_final && imf)
    5                 ip6_mfilter_free(imf);
      
              IN6_MULTI_UNLOCK();
              return (error);
      }
      
      /*
       * Select the interface for transmitting IPv6 multicast datagrams.
       *
 * An interface index (u_int) is passed to this socket option; a request of
 * any other size is rejected. An interface index of 0 removes a previous
 * selection. When no interface is selected, one is chosen for every send.
       */
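/*
 * Illustrative userland sketch, not part of this file: setting the outgoing
 * multicast interface by index. "s" is assumed to be an open AF_INET6
 * datagram socket and the interface name "em0" is an arbitrary example.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <net/if.h>
 *	#include <err.h>
 *
 *	unsigned int idx = if_nametoindex("em0");
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_IF,
 *	    &idx, sizeof(idx)) == -1)
 *		err(1, "IPV6_MULTICAST_IF");
 *
 * Passing an index of 0 clears the selection, as noted above.
 */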
      static int
      in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
      {
              struct ifnet                *ifp;
              struct ip6_moptions        *imo;
              u_int                         ifindex;
              int                         error;
      
    1         if (sopt->sopt_valsize != sizeof(u_int))
                      return (EINVAL);
      
              error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int));
    1         if (error)
                      return (error);
    2         if (V_if_index < ifindex)
                      return (EINVAL);
    2         if (ifindex == 0)
                      ifp = NULL;
              else {
                      ifp = ifnet_byindex(ifindex);
                      if (ifp == NULL)
                              return (EINVAL);
    1                 if ((ifp->if_flags & IFF_MULTICAST) == 0)
                              return (EADDRNOTAVAIL);
              }
              imo = in6p_findmoptions(inp);
              imo->im6o_multicast_ifp = ifp;
              INP_WUNLOCK(inp);
      
              return (0);
      }
      
      /*
       * Atomically set source filters on a socket for an IPv6 multicast group.
       *
       * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
       */
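/*
 * Illustrative userland sketch, not part of this file: applications normally
 * reach this handler through setsourcefilter(3), which marshals the filter
 * into the IPV6_MSFILTER request processed here. "s", the group, the source
 * and the interface name are arbitrary examples.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *	#include <net/if.h>
 *
 *	struct sockaddr_in6 grp = { .sin6_len = sizeof(grp),
 *	    .sin6_family = AF_INET6 };
 *	struct sockaddr_storage src = { .ss_len = sizeof(struct sockaddr_in6),
 *	    .ss_family = AF_INET6 };
 *	inet_pton(AF_INET6, "ff15::1234", &grp.sin6_addr);
 *	inet_pton(AF_INET6, "2001:db8::1",
 *	    &((struct sockaddr_in6 *)&src)->sin6_addr);
 *	(void)setsourcefilter(s, if_nametoindex("em0"),
 *	    (struct sockaddr *)&grp, sizeof(grp), MCAST_INCLUDE, 1, &src);
 */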
      static int
      in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
      {
              struct __msfilterreq         msfr;
              sockunion_t                *gsa;
              struct ifnet                *ifp;
              struct in6_mfilter        *imf;
              struct ip6_moptions        *imo;
              struct in6_multi                *inm;
              int                         error;
      
              error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
                  sizeof(struct __msfilterreq));
    3         if (error)
                      return (error);
      
    1         if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
                      return (ENOBUFS);
      
    2         if (msfr.msfr_fmode != MCAST_EXCLUDE &&
                  msfr.msfr_fmode != MCAST_INCLUDE)
                      return (EINVAL);
      
              if (msfr.msfr_group.ss_family != AF_INET6 ||
                  msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
                      return (EINVAL);
      
              gsa = (sockunion_t *)&msfr.msfr_group;
              if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
                      return (EINVAL);
      
              gsa->sin6.sin6_port = 0;        /* ignore port */
      
              if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
                      return (EADDRNOTAVAIL);
              ifp = ifnet_byindex(msfr.msfr_ifindex);
              if (ifp == NULL)
                      return (EADDRNOTAVAIL);
              (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
      
              /*
               * Take the INP write lock.
               * Check if this socket is a member of this group.
               */
              imo = in6p_findmoptions(inp);
              imf = im6o_match_group(imo, ifp, &gsa->sa);
              if (imf == NULL) {
                      error = EADDRNOTAVAIL;
                      goto out_in6p_locked;
              }
              inm = imf->im6f_in6m;
      
              /*
               * Begin state merge transaction at socket layer.
               */
              INP_WLOCK_ASSERT(inp);
      
              imf->im6f_st[1] = msfr.msfr_fmode;
      
              /*
               * Apply any new source filters, if present.
		 * Make a kernel copy of the user-space source vector so
		 * that it can be loaded with a single copyin. This
		 * allows us to deal with page faults up-front.
               */
              if (msfr.msfr_nsrcs > 0) {
                      struct in6_msource        *lims;
                      struct sockaddr_in6        *psin;
                      struct sockaddr_storage        *kss, *pkss;
                      int                         i;
      
                      INP_WUNLOCK(inp);
       
                      CTR2(KTR_MLD, "%s: loading %lu source list entries",
                          __func__, (unsigned long)msfr.msfr_nsrcs);
                      kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
                          M_TEMP, M_WAITOK);
                      error = copyin(msfr.msfr_srcs, kss,
                          sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
                      if (error) {
                              free(kss, M_TEMP);
                              return (error);
                      }
      
                      INP_WLOCK(inp);
      
                      /*
                       * Mark all source filters as UNDEFINED at t1.
                       * Restore new group filter mode, as im6f_leave()
                       * will set it to INCLUDE.
                       */
                      im6f_leave(imf);
                      imf->im6f_st[1] = msfr.msfr_fmode;
      
                      /*
                       * Update socket layer filters at t1, lazy-allocating
                       * new entries. This saves a bunch of memory at the
                       * cost of one RB_FIND() per source entry; duplicate
                       * entries in the msfr_nsrcs vector are ignored.
                       * If we encounter an error, rollback transaction.
                       *
		 * XXX This too could be replaced with a set-symmetric-
		 * difference style loop to avoid walking from the root
		 * every time, as the key space is common.
                       */
                      for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
                              psin = (struct sockaddr_in6 *)pkss;
                              if (psin->sin6_family != AF_INET6) {
                                      error = EAFNOSUPPORT;
                                      break;
                              }
                              if (psin->sin6_len != sizeof(struct sockaddr_in6)) {
                                      error = EINVAL;
                                      break;
                              }
                              if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) {
                                      error = EINVAL;
                                      break;
                              }
                              /*
                               * TODO: Validate embedded scope ID in source
                               * list entry against passed-in ifp, if and only
                               * if source list filter entry is iface or node local.
                               */
                              in6_clearscope(&psin->sin6_addr);
                              error = im6f_get_source(imf, psin, &lims);
                              if (error)
                                      break;
                              lims->im6sl_st[1] = imf->im6f_st[1];
                      }
                      free(kss, M_TEMP);
              }
      
              if (error)
                      goto out_im6f_rollback;
      
              INP_WLOCK_ASSERT(inp);
              IN6_MULTI_LIST_LOCK();
      
              /*
               * Begin state merge transaction at MLD layer.
               */
              CTR1(KTR_MLD, "%s: merge inm state", __func__);
              error = in6m_merge(inm, imf);
              if (error)
                      CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
              else {
                      CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
                      error = mld_change_state(inm, 0);
                      if (error)
                              CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
              }
      
              IN6_MULTI_LIST_UNLOCK();
      
      out_im6f_rollback:
              if (error)
                      im6f_rollback(imf);
              else
                      im6f_commit(imf);
      
              im6f_reap(imf);
      
      out_in6p_locked:
              INP_WUNLOCK(inp);
              return (error);
      }
      
      /*
       * Set the IP multicast options in response to user setsockopt().
       *
       * Many of the socket options handled in this function duplicate the
       * functionality of socket options in the regular unicast API. However,
       * it is not possible to merge the duplicate code, because the idempotence
       * of the IPv6 multicast part of the BSD Sockets API must be preserved;
       * the effects of these options must be treated as separate and distinct.
       *
       * SMPng: XXX: Unlocked read of inp_socket believed OK.
       */
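/*
 * Illustrative userland sketch, not part of this file, of one of the join
 * options dispatched below. "s" is assumed to be an AF_INET6 datagram
 * socket; the group address and interface name are arbitrary examples.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <err.h>
 *
 *	struct ipv6_mreq mreq;
 *	memset(&mreq, 0, sizeof(mreq));
 *	inet_pton(AF_INET6, "ff15::1234", &mreq.ipv6mr_multiaddr);
 *	mreq.ipv6mr_interface = if_nametoindex("em0");
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP,
 *	    &mreq, sizeof(mreq)) == -1)
 *		err(1, "IPV6_JOIN_GROUP");
 */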
      int
      ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
  146 {
              struct ip6_moptions        *im6o;
              int                         error;
      
              error = 0;
      
              /*
	 * If the socket is neither of type SOCK_RAW nor SOCK_DGRAM,
	 * or is a divert socket, reject it.
               */
              if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
    2             (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
                   inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
                      return (EOPNOTSUPP);
      
  119         switch (sopt->sopt_name) {
              case IPV6_MULTICAST_IF:
                      error = in6p_set_multicast_if(inp, sopt);
                      break;
      
              case IPV6_MULTICAST_HOPS: {
                      int hlim;
      
    1                 if (sopt->sopt_valsize != sizeof(int)) {
                              error = EINVAL;
                              break;
                      }
                      error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int));
    2                 if (error)
                              break;
    1                 if (hlim < -1 || hlim > 255) {
                              error = EINVAL;
                              break;
    1                 } else if (hlim == -1) {
    1                         hlim = V_ip6_defmcasthlim;
                      }
                      im6o = in6p_findmoptions(inp);
                      im6o->im6o_multicast_hlim = hlim;
                      INP_WUNLOCK(inp);
                      break;
              }
      
              case IPV6_MULTICAST_LOOP: {
                      u_int loop;
      
                      /*
                       * Set the loopback flag for outgoing multicast packets.
                       * Must be zero or one.
                       */
    1                 if (sopt->sopt_valsize != sizeof(u_int)) {
                              error = EINVAL;
                              break;
                      }
                      error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int));
    2                 if (error)
                              break;
    1                 if (loop > 1) {
                              error = EINVAL;
                              break;
                      }
    2                 im6o = in6p_findmoptions(inp);
                      im6o->im6o_multicast_loop = loop;
                      INP_WUNLOCK(inp);
                      break;
              }
      
              case IPV6_JOIN_GROUP:
              case MCAST_JOIN_GROUP:
              case MCAST_JOIN_SOURCE_GROUP:
   25                 error = in6p_join_group(inp, sopt);
                      break;
      
              case IPV6_LEAVE_GROUP:
              case MCAST_LEAVE_GROUP:
              case MCAST_LEAVE_SOURCE_GROUP:
    6                 error = in6p_leave_group(inp, sopt);
                      break;
      
              case MCAST_BLOCK_SOURCE:
              case MCAST_UNBLOCK_SOURCE:
   10                 error = in6p_block_unblock_source(inp, sopt);
                      break;
      
              case IPV6_MSFILTER:
                      error = in6p_set_source_filters(inp, sopt);
                      break;
      
              default:
                      error = EOPNOTSUPP;
                      break;
              }
      
              INP_UNLOCK_ASSERT(inp);
      
              return (error);
      }
      
      /*
       * Expose MLD's multicast filter mode and source list(s) to userland,
       * keyed by (ifindex, group).
       * The filter mode is written out as a uint32_t, followed by
       * 0..n of struct in6_addr.
       * For use by ifmcstat(8).
       * SMPng: NOTE: unlocked read of ifindex space.
       */
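/*
 * Illustrative userland sketch, not part of this file, of how a tool in the
 * style of ifmcstat(8) might query this handler. The sysctl node name used
 * here is an assumption based on where this handler is attached; "ifindex"
 * and "group" are arbitrary, caller-supplied values.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *
 *	int mib[CTL_MAXNAME];
 *	size_t miblen = CTL_MAXNAME, len = 0;
 *	sysctlnametomib("net.inet6.ip6.mcast.filters", mib, &miblen);
 *	mib[miblen] = ifindex;
 *	memcpy(&mib[miblen + 1], &group, sizeof(struct in6_addr));
 *	sysctl(mib, miblen + 5, NULL, &len, NULL, 0);
 *
 * A second call with a buffer of "len" bytes then returns the uint32_t
 * filter mode followed by the in-mode sources.
 */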
      static int
      sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS)
      {
              struct in6_addr                         mcaddr;
              struct in6_addr                         src;
              struct epoch_tracker                 et;
              struct ifnet                        *ifp;
              struct ifmultiaddr                *ifma;
              struct in6_multi                *inm;
              struct ip6_msource                *ims;
              int                                *name;
              int                                 retval;
              u_int                                 namelen;
              uint32_t                         fmode, ifindex;
      #ifdef KTR
              char                                 ip6tbuf[INET6_ADDRSTRLEN];
      #endif
      
              name = (int *)arg1;
              namelen = arg2;
      
              if (req->newptr != NULL)
                      return (EPERM);
      
              /* int: ifindex + 4 * 32 bits of IPv6 address */
              if (namelen != 5)
                      return (EINVAL);
      
              ifindex = name[0];
              if (ifindex <= 0 || ifindex > V_if_index) {
                      CTR2(KTR_MLD, "%s: ifindex %u out of range",
                          __func__, ifindex);
                      return (ENOENT);
              }
      
              memcpy(&mcaddr, &name[1], sizeof(struct in6_addr));
              if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) {
                      CTR2(KTR_MLD, "%s: group %s is not multicast",
                          __func__, ip6_sprintf(ip6tbuf, &mcaddr));
                      return (EINVAL);
              }
      
              NET_EPOCH_ENTER(et);
              ifp = ifnet_byindex(ifindex);
              if (ifp == NULL) {
                      NET_EPOCH_EXIT(et);
                      CTR2(KTR_MLD, "%s: no ifp for ifindex %u",
                          __func__, ifindex);
                      return (ENOENT);
              }
              /*
               * Internal MLD lookups require that scope/zone ID is set.
               */
              (void)in6_setscope(&mcaddr, ifp, NULL);
      
              retval = sysctl_wire_old_buffer(req,
                  sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr)));
              if (retval) {
                      NET_EPOCH_EXIT(et);
                      return (retval);
              }
      
              IN6_MULTI_LOCK();
              IN6_MULTI_LIST_LOCK();
              CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
                      inm = in6m_ifmultiaddr_get_inm(ifma);
                      if (inm == NULL)
                              continue;
                      if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr))
                              continue;
                      fmode = inm->in6m_st[1].iss_fmode;
                      retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
                      if (retval != 0)
                              break;
                      RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
                              CTR2(KTR_MLD, "%s: visit node %p", __func__, ims);
                              /*
			 * Only copy out sources which are in-mode.
                               */
                              if (fmode != im6s_get_mode(inm, ims, 1)) {
                                      CTR1(KTR_MLD, "%s: skip non-in-mode",
                                          __func__);
                                      continue;
                              }
                              src = ims->im6s_addr;
                              retval = SYSCTL_OUT(req, &src,
                                  sizeof(struct in6_addr));
                              if (retval != 0)
                                      break;
                      }
              }
              IN6_MULTI_LIST_UNLOCK();
              IN6_MULTI_UNLOCK();
              NET_EPOCH_EXIT(et);
      
              return (retval);
      }
      
      #ifdef KTR
      
      static const char *in6m_modestrs[] = { "un", "in", "ex" };
      
      static const char *
      in6m_mode_str(const int mode)
      {
      
              if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
                      return (in6m_modestrs[mode]);
              return ("??");
      }
      
      static const char *in6m_statestrs[] = {
              "not-member",
              "silent",
              "idle",
              "lazy",
              "sleeping",
              "awakening",
              "query-pending",
              "sg-query-pending",
              "leaving"
      };
      
      static const char *
      in6m_state_str(const int state)
      {
      
              if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER)
                      return (in6m_statestrs[state]);
              return ("??");
      }
      
      /*
       * Dump an in6_multi structure to the console.
       */
      void
      in6m_print(const struct in6_multi *inm)
      {
              int t;
              char ip6tbuf[INET6_ADDRSTRLEN];
      
              if ((ktr_mask & KTR_MLD) == 0)
                      return;
      
              printf("%s: --- begin in6m %p ---\n", __func__, inm);
              printf("addr %s ifp %p(%s) ifma %p\n",
                  ip6_sprintf(ip6tbuf, &inm->in6m_addr),
                  inm->in6m_ifp,
                  if_name(inm->in6m_ifp),
                  inm->in6m_ifma);
              printf("timer %u state %s refcount %u scq.len %u\n",
                  inm->in6m_timer,
                  in6m_state_str(inm->in6m_state),
                  inm->in6m_refcount,
                  mbufq_len(&inm->in6m_scq));
              printf("mli %p nsrc %lu sctimer %u scrv %u\n",
                  inm->in6m_mli,
                  inm->in6m_nsrc,
                  inm->in6m_sctimer,
                  inm->in6m_scrv);
              for (t = 0; t < 2; t++) {
                      printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
                          in6m_mode_str(inm->in6m_st[t].iss_fmode),
                          inm->in6m_st[t].iss_asm,
                          inm->in6m_st[t].iss_ex,
                          inm->in6m_st[t].iss_in,
                          inm->in6m_st[t].iss_rec);
              }
              printf("%s: --- end in6m %p ---\n", __func__, inm);
      }
      
      #else /* !KTR */
      
      void
      in6m_print(const struct in6_multi *inm)
      {
      
      }
      
      #endif /* KTR */
      /*-
       * SPDX-License-Identifier: BSD-3-Clause
       *
       * Copyright (c) 1992, 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)scanc.c        8.1 (Berkeley) 6/10/93
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/libkern.h>
      
      int
      scanc(u_int size, const u_char *cp, const u_char table[], int mask0)
 2916 {
              const u_char *end;
              u_char mask;
      
              mask = mask0;
 2916         for (end = &cp[size]; cp < end; ++cp) {
 2916                 if (table[*cp] & mask)
                              break;
              }
              return (end - cp);
      }
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2008-2011 Robert N. M. Watson
       * Copyright (c) 2010-2011 Jonathan Anderson
       * Copyright (c) 2012 FreeBSD Foundation
       * All rights reserved.
       *
       * This software was developed at the University of Cambridge Computer
       * Laboratory with support from a grant from Google, Inc.
       *
       * Portions of this software were developed by Pawel Jakub Dawidek under
       * sponsorship from the FreeBSD Foundation.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      /*
       * FreeBSD kernel capability facility.
       *
       * Two kernel features are implemented here: capability mode, a sandboxed mode
       * of execution for processes, and capabilities, a refinement on file
       * descriptors that allows fine-grained control over operations on the file
       * descriptor.  Collectively, these allow processes to run in the style of a
       * historic "capability system" in which they can use only resources
       * explicitly delegated to them.  This model is enforced by restricting access
       * to global namespaces in capability mode.
       *
       * Capabilities wrap other file descriptor types, binding them to a constant
       * rights mask set when the capability is created.  New capabilities may be
       * derived from existing capabilities, but only if they have the same or a
       * strict subset of the rights on the original capability.
       *
       * System calls permitted in capability mode are defined in capabilities.conf;
       * calls must be carefully audited for safety to ensure that they don't allow
       * escape from a sandbox.  Some calls permit only a subset of operations in
       * capability mode -- for example, shm_open(2) is limited to creating
       * anonymous, rather than named, POSIX shared memory objects.
       */
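/*
 * Illustrative userland sketch, not part of this file: a sandboxed process
 * typically opens the descriptors it needs and then enters capability mode,
 * after which access to global namespaces is refused. The path used is an
 * arbitrary example.
 *
 *	#include <sys/capsicum.h>
 *	#include <fcntl.h>
 *	#include <err.h>
 *
 *	int fd = open("/var/db/example.dat", O_RDONLY);
 *	if (fd == -1)
 *		err(1, "open");
 *	if (cap_enter() == -1)
 *		err(1, "cap_enter");
 *
 * From this point on, a call such as open(2) on an absolute path fails with
 * ECAPMODE, while I/O on the already-open "fd" continues to work.
 */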
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_capsicum.h"
      #include "opt_ktrace.h"
      
      #include <sys/param.h>
      #include <sys/capsicum.h>
      #include <sys/file.h>
      #include <sys/filedesc.h>
      #include <sys/kernel.h>
      #include <sys/limits.h>
      #include <sys/lock.h>
      #include <sys/mutex.h>
      #include <sys/proc.h>
      #include <sys/syscallsubr.h>
      #include <sys/sysproto.h>
      #include <sys/sysctl.h>
      #include <sys/systm.h>
      #include <sys/ucred.h>
      #include <sys/uio.h>
      #include <sys/ktrace.h>
      
      #include <security/audit/audit.h>
      
      #include <vm/uma.h>
      #include <vm/vm.h>
      
      bool __read_frequently trap_enotcap;
      SYSCTL_BOOL(_kern, OID_AUTO, trap_enotcap, CTLFLAG_RWTUN, &trap_enotcap, 0,
          "Deliver SIGTRAP on ENOTCAPABLE");
      
      #ifdef CAPABILITY_MODE
      
      #define        IOCTLS_MAX_COUNT        256     /* XXX: Is 256 sane? */
      
      FEATURE(security_capability_mode, "Capsicum Capability Mode");
      
      /*
       * System call to enter capability mode for the process.
       */
      int
      sys_cap_enter(struct thread *td, struct cap_enter_args *uap)
    2 {
              struct ucred *newcred, *oldcred;
              struct proc *p;
      
    1         if (IN_CAPABILITY_MODE(td))
                      return (0);
      
    1         newcred = crget();
              p = td->td_proc;
              PROC_LOCK(p);
              oldcred = crcopysafe(p, newcred);
              newcred->cr_flags |= CRED_FLAG_CAPMODE;
              proc_set_cred(p, newcred);
              PROC_UNLOCK(p);
              crfree(oldcred);
              return (0);
      }
      
      /*
       * System call to query whether the process is in capability mode.
       */
      int
      sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap)
    2 {
              u_int i;
      
              i = IN_CAPABILITY_MODE(td) ? 1 : 0;
              return (copyout(&i, uap->modep, sizeof(i)));
      }
      
      #else /* !CAPABILITY_MODE */
      
      int
      sys_cap_enter(struct thread *td, struct cap_enter_args *uap)
      {
      
              return (ENOSYS);
      }
      
      int
      sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap)
      {
      
              return (ENOSYS);
      }
      
      #endif /* CAPABILITY_MODE */
      
      #ifdef CAPABILITIES
      
      FEATURE(security_capabilities, "Capsicum Capabilities");
      
      MALLOC_DECLARE(M_FILECAPS);
      
      static inline int
      _cap_check(const cap_rights_t *havep, const cap_rights_t *needp,
          enum ktr_cap_fail_type type)
      {
      
 1362         if (!cap_rights_contains(havep, needp)) {
      #ifdef KTRACE
                      if (KTRPOINT(curthread, KTR_CAPFAIL))
                              ktrcapfail(type, needp, havep);
      #endif
                      return (ENOTCAPABLE);
              }
              return (0);
      }
      
      /*
       * Test whether a capability grants the requested rights.
       */
      int
      cap_check(const cap_rights_t *havep, const cap_rights_t *needp)
 1366 {
      
              return (_cap_check(havep, needp, CAPFAIL_NOTCAPABLE));
      }
      
      int
      cap_check_failed_notcapable(const cap_rights_t *havep, const cap_rights_t *needp)
      {
      
      #ifdef KTRACE
              if (KTRPOINT(curthread, KTR_CAPFAIL))
                      ktrcapfail(CAPFAIL_NOTCAPABLE, needp, havep);
      #endif
              return (ENOTCAPABLE);
      }
      
      /*
       * Convert capability rights into VM access flags.
       */
      vm_prot_t
      cap_rights_to_vmprot(const cap_rights_t *havep)
  175 {
              vm_prot_t maxprot;
      
              maxprot = VM_PROT_NONE;
              if (cap_rights_is_set(havep, CAP_MMAP_R))
                      maxprot |= VM_PROT_READ;
              if (cap_rights_is_set(havep, CAP_MMAP_W))
                      maxprot |= VM_PROT_WRITE;
              if (cap_rights_is_set(havep, CAP_MMAP_X))
                      maxprot |= VM_PROT_EXECUTE;
      
              return (maxprot);
      }
      
      /*
       * Extract rights from a capability for monitoring purposes -- not for use in
       * any other way, as we want to keep all capability permission evaluation in
       * this one file.
       */
      
      const cap_rights_t *
      cap_rights_fde(const struct filedescent *fdep)
      {
      
              return (cap_rights_fde_inline(fdep));
      }
      
      const cap_rights_t *
      cap_rights(struct filedesc *fdp, int fd)
 1523 {
      
              return (cap_rights_fde(&fdp->fd_ofiles[fd]));
      }
      
      int
      kern_cap_rights_limit(struct thread *td, int fd, cap_rights_t *rights)
      {
              struct filedesc *fdp;
              struct filedescent *fdep;
              u_long *ioctls;
              int error;
      
              fdp = td->td_proc->p_fd;
              FILEDESC_XLOCK(fdp);
              fdep = fdeget_locked(fdp, fd);
              if (fdep == NULL) {
                      FILEDESC_XUNLOCK(fdp);
                      return (EBADF);
              }
              ioctls = NULL;
              error = _cap_check(cap_rights(fdp, fd), rights, CAPFAIL_INCREASE);
              if (error == 0) {
                      seqc_write_begin(&fdep->fde_seqc);
                      fdep->fde_rights = *rights;
                      if (!cap_rights_is_set(rights, CAP_IOCTL)) {
                              ioctls = fdep->fde_ioctls;
                              fdep->fde_ioctls = NULL;
                              fdep->fde_nioctls = 0;
                      }
                      if (!cap_rights_is_set(rights, CAP_FCNTL))
                              fdep->fde_fcntls = 0;
                      seqc_write_end(&fdep->fde_seqc);
              }
              FILEDESC_XUNLOCK(fdp);
              free(ioctls, M_FILECAPS);
              return (error);
      }
      
      /*
       * System call to limit rights of the given capability.
       */
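/*
 * Illustrative userland sketch, not part of this file; "fd" is assumed to be
 * an open descriptor and the rights chosen are arbitrary examples.
 *
 *	#include <sys/capsicum.h>
 *	#include <errno.h>
 *	#include <err.h>
 *
 *	cap_rights_t rights;
 *	cap_rights_init(&rights, CAP_READ, CAP_SEEK);
 *	if (cap_rights_limit(fd, &rights) == -1 && errno != ENOSYS)
 *		err(1, "cap_rights_limit");
 */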
      int
      sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap)
    8 {
              cap_rights_t rights;
              int error, version;
      
              cap_rights_init_zero(&rights);
      
              error = copyin(uap->rightsp, &rights, sizeof(rights.cr_rights[0]));
    2         if (error != 0)
                      return (error);
              version = CAPVER(&rights);
    1         if (version != CAP_RIGHTS_VERSION_00)
                      return (EINVAL);
      
              error = copyin(uap->rightsp, &rights,
                  sizeof(rights.cr_rights[0]) * CAPARSIZE(&rights));
              if (error != 0)
                      return (error);
              /* Check for race. */
              if (CAPVER(&rights) != version)
                      return (EINVAL);
      
    5         if (!cap_rights_is_valid(&rights))
                      return (EINVAL);
      
              if (version != CAP_RIGHTS_VERSION) {
                      rights.cr_rights[0] &= ~(0x3ULL << 62);
                      rights.cr_rights[0] |= ((uint64_t)CAP_RIGHTS_VERSION << 62);
              }
      #ifdef KTRACE
              if (KTRPOINT(td, KTR_STRUCT))
                      ktrcaprights(&rights);
      #endif
      
              AUDIT_ARG_FD(uap->fd);
              AUDIT_ARG_RIGHTS(&rights);
              return (kern_cap_rights_limit(td, uap->fd, &rights));
      }
      
      /*
       * System call to query the rights mask associated with a capability.
       */
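/*
 * Illustrative userland sketch, not part of this file: querying the rights
 * through the cap_rights_get(3) wrapper and testing a single right; "fd" is
 * assumed to be an open descriptor.
 *
 *	#include <sys/capsicum.h>
 *	#include <stdio.h>
 *	#include <err.h>
 *
 *	cap_rights_t rights;
 *	if (cap_rights_get(fd, &rights) == -1)
 *		err(1, "cap_rights_get");
 *	if (cap_rights_is_set(&rights, CAP_WRITE))
 *		printf("descriptor may be written\n");
 */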
      int
      sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap)
    6 {
              struct filedesc *fdp;
              cap_rights_t rights;
              int error, fd, i, n;
      
    1         if (uap->version != CAP_RIGHTS_VERSION_00)
                      return (EINVAL);
      
              fd = uap->fd;
      
    5         AUDIT_ARG_FD(fd);
      
              fdp = td->td_proc->p_fd;
              FILEDESC_SLOCK(fdp);
    1         if (fget_locked(fdp, fd) == NULL) {
                      FILEDESC_SUNLOCK(fdp);
                      return (EBADF);
              }
              rights = *cap_rights(fdp, fd);
              FILEDESC_SUNLOCK(fdp);
              n = uap->version + 2;
    3         if (uap->version != CAPVER(&rights)) {
                      /*
			 * For older versions we need to check whether the
			 * descriptor holds any rights not understood by the
			 * caller; if it does, we have to return an error.
                       */
                      for (i = n; i < CAPARSIZE(&rights); i++) {
                              if ((rights.cr_rights[i] & ~(0x7FULL << 57)) != 0)
                                      return (EINVAL);
                      }
              }
              error = copyout(&rights, uap->rightsp, sizeof(rights.cr_rights[0]) * n);
      #ifdef KTRACE
    3         if (error == 0 && KTRPOINT(td, KTR_STRUCT))
                      ktrcaprights(&rights);
      #endif
              return (error);
      }
      
      /*
       * Test whether a capability grants the given ioctl command.
       * If descriptor doesn't have CAP_IOCTL, then ioctls list is empty and
       * ENOTCAPABLE will be returned.
       */
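/*
 * Illustrative userland sketch, not part of this file: restricting a
 * descriptor to a short whitelist of ioctl commands; "fd" and the commands
 * chosen are arbitrary examples.
 *
 *	#include <sys/param.h>
 *	#include <sys/capsicum.h>
 *	#include <sys/ioctl.h>
 *	#include <errno.h>
 *	#include <err.h>
 *
 *	const unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ };
 *	if (cap_ioctls_limit(fd, cmds, nitems(cmds)) == -1 && errno != ENOSYS)
 *		err(1, "cap_ioctls_limit");
 */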
      int
      cap_ioctl_check(struct filedesc *fdp, int fd, u_long cmd)
   67 {
              struct filedescent *fdep;
              u_long *cmds;
              ssize_t ncmds;
              long i;
      
              KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
                      ("%s: invalid fd=%d", __func__, fd));
      
              fdep = fdeget_locked(fdp, fd);
              KASSERT(fdep != NULL,
                  ("%s: invalid fd=%d", __func__, fd));
      
              ncmds = fdep->fde_nioctls;
   65         if (ncmds == -1)
                      return (0);
      
              cmds = fdep->fde_ioctls;
    2         for (i = 0; i < ncmds; i++) {
                      if (cmds[i] == cmd)
                              return (0);
              }
      
              return (ENOTCAPABLE);
      }
      
      /*
       * Check if the current ioctls list can be replaced by the new one.
       */
      static int
      cap_ioctl_limit_check(struct filedescent *fdep, const u_long *cmds,
          size_t ncmds)
      {
              u_long *ocmds;
              ssize_t oncmds;
              u_long i;
              long j;
      
              oncmds = fdep->fde_nioctls;
              if (oncmds == -1)
                      return (0);
              if (oncmds < (ssize_t)ncmds)
                      return (ENOTCAPABLE);
      
              ocmds = fdep->fde_ioctls;
    1         for (i = 0; i < ncmds; i++) {
                      for (j = 0; j < oncmds; j++) {
                              if (cmds[i] == ocmds[j])
                                      break;
                      }
                      if (j == oncmds)
                              return (ENOTCAPABLE);
              }
      
              return (0);
      }
      
      int
      kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds, size_t ncmds)
    4 {
              struct filedesc *fdp;
              struct filedescent *fdep;
              u_long *ocmds;
              int error;
      
    4         AUDIT_ARG_FD(fd);
      
              if (ncmds > IOCTLS_MAX_COUNT) {
                      error = EINVAL;
                      goto out_free;
              }
      
              fdp = td->td_proc->p_fd;
              FILEDESC_XLOCK(fdp);
      
              fdep = fdeget_locked(fdp, fd);
              if (fdep == NULL) {
                      error = EBADF;
                      goto out;
              }
      
              error = cap_ioctl_limit_check(fdep, cmds, ncmds);
              if (error != 0)
                      goto out;
      
              ocmds = fdep->fde_ioctls;
              seqc_write_begin(&fdep->fde_seqc);
              fdep->fde_ioctls = cmds;
              fdep->fde_nioctls = ncmds;
              seqc_write_end(&fdep->fde_seqc);
      
              cmds = ocmds;
              error = 0;
      out:
              FILEDESC_XUNLOCK(fdp);
      out_free:
              free(cmds, M_FILECAPS);
              return (error);
      }
      
      int
      sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap)
    4 {
              u_long *cmds;
              size_t ncmds;
              int error;
      
              ncmds = uap->ncmds;
      
              if (ncmds > IOCTLS_MAX_COUNT)
                      return (EINVAL);
      
    4         if (ncmds == 0) {
                      cmds = NULL;
              } else {
                      cmds = malloc(sizeof(cmds[0]) * ncmds, M_FILECAPS, M_WAITOK);
                      error = copyin(uap->cmds, cmds, sizeof(cmds[0]) * ncmds);
                      if (error != 0) {
                              free(cmds, M_FILECAPS);
                              return (error);
                      }
              }
      
              return (kern_cap_ioctls_limit(td, uap->fd, cmds, ncmds));
      }
      
      int
      sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap)
    5 {
              struct filedesc *fdp;
              struct filedescent *fdep;
              u_long *cmdsp, *dstcmds;
              size_t maxcmds, ncmds;
              int16_t count;
              int error, fd;
      
              fd = uap->fd;
              dstcmds = uap->cmds;
              maxcmds = uap->maxcmds;
      
    5         AUDIT_ARG_FD(fd);
      
              fdp = td->td_proc->p_fd;
      
              cmdsp = NULL;
    3         if (dstcmds != NULL) {
                      cmdsp = malloc(sizeof(cmdsp[0]) * IOCTLS_MAX_COUNT, M_FILECAPS,
                          M_WAITOK | M_ZERO);
              }
      
              FILEDESC_SLOCK(fdp);
              fdep = fdeget_locked(fdp, fd);
              if (fdep == NULL) {
                      error = EBADF;
                      FILEDESC_SUNLOCK(fdp);
                      goto out;
              }
              count = fdep->fde_nioctls;
    2         if (count != -1 && cmdsp != NULL) {
    1                 ncmds = MIN(count, maxcmds);
                      memcpy(cmdsp, fdep->fde_ioctls, sizeof(cmdsp[0]) * ncmds);
              }
              FILEDESC_SUNLOCK(fdp);
      
              /*
	 * If all ioctls are allowed (fde_nioctls == -1 && fde_ioctls == NULL),
	 * the only sane thing we can do is to not populate the given array
	 * and return CAP_IOCTLS_ALL.
               */
              if (count != -1) {
    1                 if (cmdsp != NULL) {
                              error = copyout(cmdsp, dstcmds,
                                  sizeof(cmdsp[0]) * ncmds);
    1                         if (error != 0)
                                      goto out;
                      }
                      td->td_retval[0] = count;
              } else {
    1                 td->td_retval[0] = CAP_IOCTLS_ALL;
              }
      
              error = 0;
      out:
              free(cmdsp, M_FILECAPS);
              return (error);
      }
      
      /*
       * Test whether a capability grants the given fcntl command.
       */
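/*
 * Illustrative userland sketch, not part of this file: limiting the set of
 * fcntl(2) commands permitted on a descriptor; "fd" and the flags chosen are
 * arbitrary examples.
 *
 *	#include <sys/capsicum.h>
 *	#include <errno.h>
 *	#include <err.h>
 *
 *	if (cap_fcntls_limit(fd, CAP_FCNTL_GETFL | CAP_FCNTL_SETFL) == -1 &&
 *	    errno != ENOSYS)
 *		err(1, "cap_fcntls_limit");
 */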
      int
      cap_fcntl_check_fde(struct filedescent *fdep, int cmd)
      {
              uint32_t fcntlcap;
      
              fcntlcap = (1 << cmd);
              KASSERT((CAP_FCNTL_ALL & fcntlcap) != 0,
                  ("Unsupported fcntl=%d.", cmd));
      
              if ((fdep->fde_fcntls & fcntlcap) != 0)
                      return (0);
      
              return (ENOTCAPABLE);
      }
      
      int
      cap_fcntl_check(struct filedesc *fdp, int fd, int cmd)
   30 {
      
              KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
                  ("%s: invalid fd=%d", __func__, fd));
      
   30         return (cap_fcntl_check_fde(&fdp->fd_ofiles[fd], cmd));
      }
      
      int
      sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap)
    4 {
              struct filedesc *fdp;
              struct filedescent *fdep;
              uint32_t fcntlrights;
              int fd;
      
              fd = uap->fd;
              fcntlrights = uap->fcntlrights;
      
    4         AUDIT_ARG_FD(fd);
              AUDIT_ARG_FCNTL_RIGHTS(fcntlrights);
      
    1         if ((fcntlrights & ~CAP_FCNTL_ALL) != 0)
                      return (EINVAL);
      
              fdp = td->td_proc->p_fd;
              FILEDESC_XLOCK(fdp);
      
              fdep = fdeget_locked(fdp, fd);
              if (fdep == NULL) {
                      FILEDESC_XUNLOCK(fdp);
                      return (EBADF);
              }
      
              if ((fcntlrights & ~fdep->fde_fcntls) != 0) {
                      FILEDESC_XUNLOCK(fdp);
                      return (ENOTCAPABLE);
              }
      
              seqc_write_begin(&fdep->fde_seqc);
              fdep->fde_fcntls = fcntlrights;
              seqc_write_end(&fdep->fde_seqc);
              FILEDESC_XUNLOCK(fdp);
      
              return (0);
      }
      
      int
      sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap)
    4 {
              struct filedesc *fdp;
              struct filedescent *fdep;
              uint32_t rights;
              int fd;
      
              fd = uap->fd;
      
    4         AUDIT_ARG_FD(fd);
      
              fdp = td->td_proc->p_fd;
              FILEDESC_SLOCK(fdp);
              fdep = fdeget_locked(fdp, fd);
              if (fdep == NULL) {
                      FILEDESC_SUNLOCK(fdp);
                      return (EBADF);
              }
    2         rights = fdep->fde_fcntls;
              FILEDESC_SUNLOCK(fdp);
      
              return (copyout(&rights, uap->fcntlrightsp, sizeof(rights)));
      }
      
      #else /* !CAPABILITIES */
      
      /*
       * Stub Capability functions for when options CAPABILITIES isn't compiled
       * into the kernel.
       */
      
      int
      sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap)
      {
      
              return (ENOSYS);
      }
      
      int
      sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap)
      {
      
              return (ENOSYS);
      }
      
      int
      sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap)
      {
      
              return (ENOSYS);
      }
      
      int
      sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap)
      {
      
              return (ENOSYS);
      }
      
      int
      sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap)
      {
      
              return (ENOSYS);
      }
      
      int
      sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap)
      {
      
              return (ENOSYS);
      }
      
      #endif /* CAPABILITIES */
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2010,2013 Lawrence Stewart <lstewart@freebsd.org>
       * Copyright (c) 2010 The FreeBSD Foundation
       * All rights reserved.
       *
       * This software was developed by Lawrence Stewart while studying at the Centre
       * for Advanced Internet Architectures, Swinburne University of Technology,
       * made possible in part by grants from the FreeBSD Foundation and Cisco
       * University Research Program Fund at Community Foundation Silicon Valley.
       *
       * Portions of this software were developed at the Centre for Advanced
       * Internet Architectures, Swinburne University of Technology, Melbourne,
       * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/kernel.h>
      #include <sys/hhook.h>
      #include <sys/khelp.h>
      #include <sys/lock.h>
      #include <sys/malloc.h>
      #include <sys/module.h>
      #include <sys/module_khelp.h>
      #include <sys/osd.h>
      #include <sys/queue.h>
      #include <sys/refcount.h>
      #include <sys/rwlock.h>
      #include <sys/systm.h>
      
      static struct rwlock khelp_list_lock;
      RW_SYSINIT(khelplistlock, &khelp_list_lock, "helper list lock");
      
      static TAILQ_HEAD(helper_head, helper) helpers = TAILQ_HEAD_INITIALIZER(helpers);
      
      /* Private function prototypes. */
      static inline void khelp_remove_osd(struct helper *h, struct osd *hosd);
      void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags);
      
      #define        KHELP_LIST_WLOCK() rw_wlock(&khelp_list_lock)
      #define        KHELP_LIST_WUNLOCK() rw_wunlock(&khelp_list_lock)
      #define        KHELP_LIST_RLOCK() rw_rlock(&khelp_list_lock)
      #define        KHELP_LIST_RUNLOCK() rw_runlock(&khelp_list_lock)
      #define        KHELP_LIST_LOCK_ASSERT() rw_assert(&khelp_list_lock, RA_LOCKED)
      
      int
      khelp_register_helper(struct helper *h)
      {
              struct helper *tmph;
              int error, i, inserted;
      
              error = inserted = 0;
              refcount_init(&h->h_refcount, 0);
              h->h_id = osd_register(OSD_KHELP, NULL, NULL);
      
              /* It's only safe to add the hooks after osd_register(). */
              for (i = 0; i < h->h_nhooks && !error; i++) {
                      /* We don't require the module to assign hook_helper. */
                      h->h_hooks[i].hook_helper = h;
                      error = hhook_add_hook_lookup(&h->h_hooks[i], HHOOK_WAITOK);
                      if (error)
                              printf("%s: \"%s\" khelp module unable to "
                                  "hook type %d id %d due to error %d\n", __func__,
                                  h->h_name, h->h_hooks[i].hook_type,
                                  h->h_hooks[i].hook_id, error);
              }
      
              if (error) {
                      for (i--; i >= 0; i--)
                              hhook_remove_hook_lookup(&h->h_hooks[i]);
                      osd_deregister(OSD_KHELP, h->h_id);
              } else {
                      KHELP_LIST_WLOCK();
                      /*
                       * Keep list of helpers sorted in descending h_id order. Due to
                       * the way osd_set() works, a sorted list ensures
                       * khelp_init_osd() will operate with improved efficiency.
                       */
                      TAILQ_FOREACH(tmph, &helpers, h_next) {
                              if (tmph->h_id < h->h_id) {
                                      TAILQ_INSERT_BEFORE(tmph, h, h_next);
                                      inserted = 1;
                                      break;
                              }
                      }
      
                      if (!inserted)
                              TAILQ_INSERT_TAIL(&helpers, h, h_next);
                      KHELP_LIST_WUNLOCK();
              }
      
              return (error);
      }
      
      int
      khelp_deregister_helper(struct helper *h)
      {
              struct helper *tmph;
              int error, i;
      
              KHELP_LIST_WLOCK();
              if (h->h_refcount > 0)
                      error = EBUSY;
              else {
                      error = ENOENT;
                      TAILQ_FOREACH(tmph, &helpers, h_next) {
                              if (tmph == h) {
                                      TAILQ_REMOVE(&helpers, h, h_next);
                                      error = 0;
                                      break;
                              }
                      }
              }
              KHELP_LIST_WUNLOCK();
      
              if (!error) {
                      for (i = 0; i < h->h_nhooks; i++)
                              hhook_remove_hook_lookup(&h->h_hooks[i]);
                      osd_deregister(OSD_KHELP, h->h_id);
              }
      
              return (error);
      }
      
      int
      khelp_init_osd(uint32_t classes, struct osd *hosd)
  811 {
              struct helper *h;
              void *hdata;
              int error;
      
              KASSERT(hosd != NULL, ("struct osd not initialised!"));
      
              error = 0;
      
              KHELP_LIST_RLOCK();
  811         TAILQ_FOREACH(h, &helpers, h_next) {
                      /* If helper is correct class and needs to store OSD... */
                      if (h->h_classes & classes && h->h_flags & HELPER_NEEDS_OSD) {
                              hdata = uma_zalloc(h->h_zone, M_NOWAIT);
                              if (hdata == NULL) {
                                      error = ENOMEM;
                                      break;
                              }
                              osd_set(OSD_KHELP, hosd, h->h_id, hdata);
                              refcount_acquire(&h->h_refcount);
                      }
              }
      
              if (error) {
                      /* Delete OSD that was assigned prior to the error. */
                      TAILQ_FOREACH(h, &helpers, h_next) {
                              if (h->h_classes & classes)
                                      khelp_remove_osd(h, hosd);
                      }
              }
              KHELP_LIST_RUNLOCK();
      
              return (error);
      }
      
      int
      khelp_destroy_osd(struct osd *hosd)
  189 {
              struct helper *h;
              int error;
      
              KASSERT(hosd != NULL, ("struct osd not initialised!"));
      
              error = 0;
      
              KHELP_LIST_RLOCK();
              /*
               * Clean up all khelp related OSD.
               *
               * XXXLAS: Would be nice to use something like osd_exit() here but it
               * doesn't have the right semantics for this purpose.
               */
  189         TAILQ_FOREACH(h, &helpers, h_next)
                      khelp_remove_osd(h, hosd);
              KHELP_LIST_RUNLOCK();
      
              return (error);
      }
      
      static inline void
      khelp_remove_osd(struct helper *h, struct osd *hosd)
      {
              void *hdata;
      
              if (h->h_flags & HELPER_NEEDS_OSD) {
                      /*
                       * If the current helper uses OSD and calling osd_get()
                       * on the helper's h_id returns non-NULL, the helper has
                       * OSD attached to 'hosd' which needs to be cleaned up.
                       */
                      hdata = osd_get(OSD_KHELP, hosd, h->h_id);
                      if (hdata != NULL) {
                              uma_zfree(h->h_zone, hdata);
                              osd_del(OSD_KHELP, hosd, h->h_id);
                              refcount_release(&h->h_refcount);
                      }
              }
      }
      
      void *
      khelp_get_osd(struct osd *hosd, int32_t id)
      {
      
              return (osd_get(OSD_KHELP, hosd, id));
      }
      
      int32_t
      khelp_get_id(char *hname)
      {
              struct helper *h;
              int32_t id;
      
              id = -1;
      
              KHELP_LIST_RLOCK();
              TAILQ_FOREACH(h, &helpers, h_next) {
                      if (strncmp(h->h_name, hname, HELPER_NAME_MAXLEN) == 0) {
                              id = h->h_id;
                              break;
                      }
              }
              KHELP_LIST_RUNLOCK();
      
              return (id);
      }
      
      int
      khelp_add_hhook(struct hookinfo *hki, uint32_t flags)
      {
              int error;
      
              /*
               * XXXLAS: Should probably include the functionality to update the
               * helper's h_hooks struct member.
               */
              error = hhook_add_hook_lookup(hki, flags);
      
              return (error);
      }
      
      int
      khelp_remove_hhook(struct hookinfo *hki)
      {
              int error;
      
              /*
               * XXXLAS: Should probably include the functionality to update the
               * helper's h_hooks struct member.
               */
              error = hhook_remove_hook_lookup(hki);
      
              return (error);
      }
      
      /*
       * Private KPI between hhook and khelp that allows khelp modules to insert hook
       * functions into hhook points which register after the modules were loaded.
       */
      void
      khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags)
      {
              struct helper *h;
              int error, i;
      
              KHELP_LIST_RLOCK();
              TAILQ_FOREACH(h, &helpers, h_next) {
                      for (i = 0; i < h->h_nhooks; i++) {
                              if (hhh->hhh_type != h->h_hooks[i].hook_type ||
                                  hhh->hhh_id != h->h_hooks[i].hook_id)
                                      continue;
                              error = hhook_add_hook(hhh, &h->h_hooks[i], flags);
                              if (error) {
                                      printf("%s: \"%s\" khelp module unable to "
                                          "hook type %d id %d due to error %d\n",
                                          __func__, h->h_name,
                                          h->h_hooks[i].hook_type,
                                          h->h_hooks[i].hook_id, error);
                                      error = 0;
                              }
                      }
              }
              KHELP_LIST_RUNLOCK();
      }
      
      int
      khelp_modevent(module_t mod, int event_type, void *data)
      {
              struct khelp_modevent_data *kmd;
              int error;
      
              kmd = (struct khelp_modevent_data *)data;
              error = 0;
      
              switch(event_type) {
              case MOD_LOAD:
                      if (kmd->helper->h_flags & HELPER_NEEDS_OSD) {
                              if (kmd->uma_zsize <= 0) {
                                      printf("Use KHELP_DECLARE_MOD_UMA() instead!\n");
                                      error = EDOOFUS;
                                      break;
                              }
                              kmd->helper->h_zone = uma_zcreate(kmd->name,
                                  kmd->uma_zsize, kmd->umactor, kmd->umadtor, NULL,
                                  NULL, 0, 0);
                              if (kmd->helper->h_zone == NULL) {
                                      error = ENOMEM;
                                      break;
                              }
                      }
                      strlcpy(kmd->helper->h_name, kmd->name, HELPER_NAME_MAXLEN);
                      kmd->helper->h_hooks = kmd->hooks;
                      kmd->helper->h_nhooks = kmd->nhooks;
                      if (kmd->helper->mod_init != NULL)
                              error = kmd->helper->mod_init();
                      if (!error)
                              error = khelp_register_helper(kmd->helper);
                      break;
      
              case MOD_QUIESCE:
              case MOD_SHUTDOWN:
              case MOD_UNLOAD:
                      error = khelp_deregister_helper(kmd->helper);
                      if (!error) {
                              if (kmd->helper->h_flags & HELPER_NEEDS_OSD)
                                      uma_zdestroy(kmd->helper->h_zone);
                              if (kmd->helper->mod_destroy != NULL)
                                      kmd->helper->mod_destroy();
                      } else if (error == ENOENT)
                              /* Do nothing and allow unload if helper not in list. */
                              error = 0;
                      else if (error == EBUSY)
                              printf("Khelp module \"%s\" can't unload until its "
                                  "refcount drops from %d to 0.\n", kmd->name,
                                  kmd->helper->h_refcount);
                      break;
      
              default:
                      error = EINVAL;
                      break;
              }
      
              return (error);
      }
      /*-
       * SPDX-License-Identifier: BSD-3-Clause
       *
       * Copyright (c) 1982, 1986, 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)uipc_domain.c        8.2 (Berkeley) 10/18/93
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/socket.h>
      #include <sys/protosw.h>
      #include <sys/domain.h>
      #include <sys/eventhandler.h>
      #include <sys/epoch.h>
      #include <sys/mbuf.h>
      #include <sys/kernel.h>
      #include <sys/lock.h>
      #include <sys/mutex.h>
      #include <sys/socketvar.h>
      #include <sys/systm.h>
      
      #include <net/vnet.h>
      
      /*
       * System initialization
       *
       * Note: domain initialization takes place on a per domain basis
       * as a result of traversing a SYSINIT linker set.  Most likely,
       * each domain would want to call DOMAIN_SET(9) itself, which
       * would cause the domain to be added just after domaininit()
       * is called during startup.
       *
       * See DOMAIN_SET(9) for details on its use.
       */
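
/*
 * Illustrative sketch only (not part of the original source): a protocol
 * module typically declares a struct domain whose dom_protosw and
 * dom_protoswNPROTOSW pointers bracket its protosw array, and registers it
 * with DOMAIN_SET(9), which arranges for domain_add() and domain_init()
 * below to run from the SYSINIT linker set.  The AF_MYPROTO family and
 * "myproto" names are hypothetical:
 *
 *	static struct protosw myprotosw[] = { ... };
 *
 *	static struct domain myprotodomain = {
 *		.dom_family =		AF_MYPROTO,
 *		.dom_name =		"myproto",
 *		.dom_protosw =		myprotosw,
 *		.dom_protoswNPROTOSW =	&myprotosw[nitems(myprotosw)],
 *	};
 *
 *	DOMAIN_SET(myproto);
 */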
      
      static void domaininit(void *);
      SYSINIT(domain, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, domaininit, NULL);
      
      static void domainfinalize(void *);
      SYSINIT(domainfin, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, domainfinalize,
          NULL);
      
      static struct callout pffast_callout;
      static struct callout pfslow_callout;
      
      static void        pffasttimo(void *);
      static void        pfslowtimo(void *);
      
      struct domain *domains;                /* registered protocol domains */
      int domain_init_status = 0;
      static struct mtx dom_mtx;                /* domain list lock */
      MTX_SYSINIT(domain, &dom_mtx, "domain list", MTX_DEF);
      
      /*
       * Dummy protocol specific user requests function pointer array.
       * All functions return EOPNOTSUPP.
       */
      struct pr_usrreqs nousrreqs = {
              .pru_accept =                pru_accept_notsupp,
              .pru_attach =                pru_attach_notsupp,
              .pru_bind =                pru_bind_notsupp,
              .pru_connect =                pru_connect_notsupp,
              .pru_connect2 =                pru_connect2_notsupp,
              .pru_control =                pru_control_notsupp,
              .pru_disconnect        =        pru_disconnect_notsupp,
              .pru_listen =                pru_listen_notsupp,
              .pru_peeraddr =                pru_peeraddr_notsupp,
              .pru_rcvd =                pru_rcvd_notsupp,
              .pru_rcvoob =                pru_rcvoob_notsupp,
              .pru_send =                pru_send_notsupp,
              .pru_sense =                pru_sense_null,
              .pru_shutdown =                pru_shutdown_notsupp,
              .pru_sockaddr =                pru_sockaddr_notsupp,
              .pru_sosend =                pru_sosend_notsupp,
              .pru_soreceive =        pru_soreceive_notsupp,
              .pru_sopoll =                pru_sopoll_notsupp,
      };
      
      static void
      protosw_init(struct protosw *pr)
      {
              struct pr_usrreqs *pu;
      
              pu = pr->pr_usrreqs;
              KASSERT(pu != NULL, ("protosw_init: %ssw[%d] has no usrreqs!",
                  pr->pr_domain->dom_name,
                  (int)(pr - pr->pr_domain->dom_protosw)));
      
              /*
               * Protocol switch methods fall into three categories: mandatory,
               * mandatory but protosw_init() provides a default, and optional.
               *
               * For true protocols (i.e., pru_attach != NULL), KASSERT truly
               * mandatory methods with no defaults, and initialize defaults for
               * other mandatory methods if the protocol hasn't defined an
               * implementation (NULL function pointer).
               */
      #if 0
              if (pu->pru_attach != NULL) {
                      KASSERT(pu->pru_abort != NULL,
                          ("protosw_init: %ssw[%d] pru_abort NULL",
                          pr->pr_domain->dom_name,
                          (int)(pr - pr->pr_domain->dom_protosw)));
                      KASSERT(pu->pru_send != NULL,
                          ("protosw_init: %ssw[%d] pru_send NULL",
                          pr->pr_domain->dom_name,
                          (int)(pr - pr->pr_domain->dom_protosw)));
              }
      #endif
      
      #define DEFAULT(foo, bar)        if ((foo) == NULL)  (foo) = (bar)
              DEFAULT(pu->pru_accept, pru_accept_notsupp);
              DEFAULT(pu->pru_aio_queue, pru_aio_queue_notsupp);
              DEFAULT(pu->pru_bind, pru_bind_notsupp);
              DEFAULT(pu->pru_bindat, pru_bindat_notsupp);
              DEFAULT(pu->pru_connect, pru_connect_notsupp);
              DEFAULT(pu->pru_connect2, pru_connect2_notsupp);
              DEFAULT(pu->pru_connectat, pru_connectat_notsupp);
              DEFAULT(pu->pru_control, pru_control_notsupp);
              DEFAULT(pu->pru_disconnect, pru_disconnect_notsupp);
              DEFAULT(pu->pru_listen, pru_listen_notsupp);
              DEFAULT(pu->pru_peeraddr, pru_peeraddr_notsupp);
              DEFAULT(pu->pru_rcvd, pru_rcvd_notsupp);
              DEFAULT(pu->pru_rcvoob, pru_rcvoob_notsupp);
              DEFAULT(pu->pru_sense, pru_sense_null);
              DEFAULT(pu->pru_shutdown, pru_shutdown_notsupp);
              DEFAULT(pu->pru_sockaddr, pru_sockaddr_notsupp);
              DEFAULT(pu->pru_sosend, sosend_generic);
              DEFAULT(pu->pru_soreceive, soreceive_generic);
              DEFAULT(pu->pru_sopoll, sopoll_generic);
              DEFAULT(pu->pru_ready, pru_ready_notsupp);
      #undef DEFAULT
              if (pr->pr_init)
                      (*pr->pr_init)();
      }
      
/*
 * Initialize a protocol domain: call its dom_init routine and set up each
 * entry in its protosw array, then update the global header-size maximums.
 * Note: you can't unload it again because a socket may be using it.
 * XXX can't fail at this time.
 */
      void
      domain_init(void *arg)
      {
              struct domain *dp = arg;
              struct protosw *pr;
      
              if (dp->dom_init)
                      (*dp->dom_init)();
              for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
                      protosw_init(pr);
              /*
               * update global information about maximums
               */
              max_hdr = max_linkhdr + max_protohdr;
              max_datalen = MHLEN - max_hdr;
              if (max_datalen < 1)
                      panic("%s: max_datalen < 1", __func__);
      }
      
      #ifdef VIMAGE
      void
      vnet_domain_init(void *arg)
      {
      
              /* Virtualized case is no different -- call init functions. */
              domain_init(arg);
      }
      
      void
      vnet_domain_uninit(void *arg)
      {
              struct domain *dp = arg;
      
              if (dp->dom_destroy)
                      (*dp->dom_destroy)();
      }
      #endif
      
      /*
 * Add a new protocol domain to the list of supported domains.
 * Note: you can't unload it again because a socket may be using it.
       * XXX can't fail at this time.
       */
      void
      domain_add(void *data)
      {
              struct domain *dp;
      
              dp = (struct domain *)data;
              mtx_lock(&dom_mtx);
              dp->dom_next = domains;
              domains = dp;
      
              KASSERT(domain_init_status >= 1,
                  ("attempt to domain_add(%s) before domaininit()",
                  dp->dom_name));
      #ifndef INVARIANTS
              if (domain_init_status < 1)
                      printf("WARNING: attempt to domain_add(%s) before "
                          "domaininit()\n", dp->dom_name);
      #endif
      #ifdef notyet
              KASSERT(domain_init_status < 2,
                  ("attempt to domain_add(%s) after domainfinalize()",
                  dp->dom_name));
      #else
              if (domain_init_status >= 2)
                      printf("WARNING: attempt to domain_add(%s) after "
                          "domainfinalize()\n", dp->dom_name);
      #endif
              mtx_unlock(&dom_mtx);
      }
      
      /* ARGSUSED*/
      static void
      domaininit(void *dummy)
      {
      
              if (max_linkhdr < 16)                /* XXX */
                      max_linkhdr = 16;
      
              callout_init(&pffast_callout, 1);
              callout_init(&pfslow_callout, 1);
      
              mtx_lock(&dom_mtx);
              KASSERT(domain_init_status == 0, ("domaininit called too late!"));
              domain_init_status = 1;
              mtx_unlock(&dom_mtx);
      }
      
      /* ARGSUSED*/
      static void
      domainfinalize(void *dummy)
      {
      
              mtx_lock(&dom_mtx);
              KASSERT(domain_init_status == 1, ("domainfinalize called too late!"));
              domain_init_status = 2;
              mtx_unlock(&dom_mtx);        
      
              callout_reset(&pffast_callout, 1, pffasttimo, NULL);
              callout_reset(&pfslow_callout, 1, pfslowtimo, NULL);
      }
      
      struct domain *
      pffinddomain(int family)
{
              struct domain *dp;
      
	for (dp = domains; dp != NULL; dp = dp->dom_next)
		if (dp->dom_family == family)
                              return (dp);
              return (NULL);
      }
      
      struct protosw *
      pffindtype(int family, int type)
{
              struct domain *dp;
              struct protosw *pr;
      
              dp = pffinddomain(family);
              if (dp == NULL)
                      return (NULL);
      
	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
		if (pr->pr_type && pr->pr_type == type)
                              return (pr);
              return (NULL);
      }
      
      struct protosw *
      pffindproto(int family, int protocol, int type)
{
              struct domain *dp;
              struct protosw *pr;
              struct protosw *maybe;
      
              maybe = NULL;
	if (family == 0)
                      return (NULL);
      
              dp = pffinddomain(family);
              if (dp == NULL)
                      return (NULL);
      
	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
		if ((pr->pr_protocol == protocol) && (pr->pr_type == type))
                              return (pr);
      
		if (type == SOCK_RAW && pr->pr_type == SOCK_RAW &&
		    pr->pr_protocol == 0 && maybe == NULL)
                              maybe = pr;
              }
              return (maybe);
      }
      
      /*
       * The caller must make sure that the new protocol is fully set up and ready to
       * accept requests before it is registered.
       */
      int
      pf_proto_register(int family, struct protosw *npr)
      {
              VNET_ITERATOR_DECL(vnet_iter);
              struct domain *dp;
              struct protosw *pr, *fpr;
      
              /* Sanity checks. */
              if (family == 0)
                      return (EPFNOSUPPORT);
              if (npr->pr_type == 0)
                      return (EPROTOTYPE);
              if (npr->pr_protocol == 0)
                      return (EPROTONOSUPPORT);
              if (npr->pr_usrreqs == NULL)
                      return (ENXIO);
      
              /* Try to find the specified domain based on the family. */
              dp = pffinddomain(family);
              if (dp == NULL)
                      return (EPFNOSUPPORT);
      
              /* Initialize backpointer to struct domain. */
              npr->pr_domain = dp;
              fpr = NULL;
      
              /*
               * Protect us against races when two protocol registrations for
               * the same protocol happen at the same time.
               */
              mtx_lock(&dom_mtx);
      
              /* The new protocol must not yet exist. */
              for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
                      if ((pr->pr_type == npr->pr_type) &&
                          (pr->pr_protocol == npr->pr_protocol)) {
                              mtx_unlock(&dom_mtx);
                              return (EEXIST);        /* XXX: Check only protocol? */
                      }
                      /* While here, remember the first free spacer. */
                      if ((fpr == NULL) && (pr->pr_protocol == PROTO_SPACER))
                              fpr = pr;
              }
      
              /* If no free spacer is found we can't add the new protocol. */
              if (fpr == NULL) {
                      mtx_unlock(&dom_mtx);
                      return (ENOMEM);
              }
      
              /* Copy the new struct protosw over the spacer. */
              bcopy(npr, fpr, sizeof(*fpr));
      
              /* Job is done, no more protection required. */
              mtx_unlock(&dom_mtx);
      
              /* Initialize and activate the protocol. */
              VNET_LIST_RLOCK();
              VNET_FOREACH(vnet_iter) {
                      CURVNET_SET_QUIET(vnet_iter);
                      protosw_init(fpr);
                      CURVNET_RESTORE();
              }
              VNET_LIST_RUNLOCK();
      
              return (0);
      }
      
      /*
       * The caller must make sure the protocol and its functions correctly shut down
       * all sockets and release all locks and memory references.
       */
      int
      pf_proto_unregister(int family, int protocol, int type)
      {
              struct domain *dp;
              struct protosw *pr, *dpr;
      
              /* Sanity checks. */
              if (family == 0)
                      return (EPFNOSUPPORT);
              if (protocol == 0)
                      return (EPROTONOSUPPORT);
              if (type == 0)
                      return (EPROTOTYPE);
      
              /* Try to find the specified domain based on the family type. */
              dp = pffinddomain(family);
              if (dp == NULL)
                      return (EPFNOSUPPORT);
      
              dpr = NULL;
      
              /* Lock out everyone else while we are manipulating the protosw. */
              mtx_lock(&dom_mtx);
      
              /* The protocol must exist and only once. */
              for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
                      if ((pr->pr_type == type) && (pr->pr_protocol == protocol)) {
                              if (dpr != NULL) {
                                      mtx_unlock(&dom_mtx);
                                      return (EMLINK);   /* Should not happen! */
                              } else
                                      dpr = pr;
                      }
              }
      
              /* Protocol does not exist. */
              if (dpr == NULL) {
                      mtx_unlock(&dom_mtx);
                      return (EPROTONOSUPPORT);
              }
      
              /* De-orbit the protocol and make the slot available again. */
              dpr->pr_type = 0;
              dpr->pr_domain = dp;
              dpr->pr_protocol = PROTO_SPACER;
              dpr->pr_flags = 0;
              dpr->pr_input = NULL;
              dpr->pr_output = NULL;
              dpr->pr_ctlinput = NULL;
              dpr->pr_ctloutput = NULL;
              dpr->pr_init = NULL;
              dpr->pr_fasttimo = NULL;
              dpr->pr_slowtimo = NULL;
              dpr->pr_drain = NULL;
              dpr->pr_usrreqs = &nousrreqs;
      
	/* Job is done, no more protection required. */
              mtx_unlock(&dom_mtx);
      
              return (0);
      }
      
      void
      pfctlinput(int cmd, struct sockaddr *sa)
      {
              struct domain *dp;
              struct protosw *pr;
      
              for (dp = domains; dp; dp = dp->dom_next)
                      for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
                              if (pr->pr_ctlinput)
                                      (*pr->pr_ctlinput)(cmd, sa, (void *)0);
      }
      
      static void
      pfslowtimo(void *arg)
      {
              struct epoch_tracker et;
              struct domain *dp;
              struct protosw *pr;
      
              NET_EPOCH_ENTER(et);
              for (dp = domains; dp; dp = dp->dom_next)
                      for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
                              if (pr->pr_slowtimo)
                                      (*pr->pr_slowtimo)();
              NET_EPOCH_EXIT(et);
              callout_reset(&pfslow_callout, hz/2, pfslowtimo, NULL);
      }
      
      static void
      pffasttimo(void *arg)
      {
              struct epoch_tracker et;
              struct domain *dp;
              struct protosw *pr;
      
              NET_EPOCH_ENTER(et);
              for (dp = domains; dp; dp = dp->dom_next)
                      for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
                              if (pr->pr_fasttimo)
                                      (*pr->pr_fasttimo)();
              NET_EPOCH_EXIT(et);
              callout_reset(&pffast_callout, hz/5, pffasttimo, NULL);
      }
      /*-
       * SPDX-License-Identifier: BSD-3-Clause
       *
       * Copyright (c) 1982, 1986, 1989, 1991, 1993
       *        The Regents of the University of California.  All rights reserved.
       * (c) UNIX System Laboratories, Inc.
       * All or some portions of this file are derived from material licensed
       * to the University of California by American Telephone and Telegraph
       * Co. or Unix System Laboratories, Inc. and are reproduced herein with
       * the permission of UNIX System Laboratories, Inc.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)kern_descrip.c        8.6 (Berkeley) 4/19/94
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_capsicum.h"
      #include "opt_ddb.h"
      #include "opt_ktrace.h"
      
      #include <sys/param.h>
      #include <sys/systm.h>
      
      #include <sys/capsicum.h>
      #include <sys/conf.h>
      #include <sys/fcntl.h>
      #include <sys/file.h>
      #include <sys/filedesc.h>
      #include <sys/filio.h>
      #include <sys/jail.h>
      #include <sys/kernel.h>
      #include <sys/limits.h>
      #include <sys/lock.h>
      #include <sys/malloc.h>
      #include <sys/mount.h>
      #include <sys/mutex.h>
      #include <sys/namei.h>
      #include <sys/selinfo.h>
      #include <sys/priv.h>
      #include <sys/proc.h>
      #include <sys/protosw.h>
      #include <sys/racct.h>
      #include <sys/resourcevar.h>
      #include <sys/sbuf.h>
      #include <sys/signalvar.h>
      #include <sys/kdb.h>
      #include <sys/smr.h>
      #include <sys/stat.h>
      #include <sys/sx.h>
      #include <sys/syscallsubr.h>
      #include <sys/sysctl.h>
      #include <sys/sysproto.h>
      #include <sys/unistd.h>
      #include <sys/user.h>
      #include <sys/vnode.h>
      #ifdef KTRACE
      #include <sys/ktrace.h>
      #endif
      
      #include <net/vnet.h>
      
      #include <security/audit/audit.h>
      
      #include <vm/uma.h>
      #include <vm/vm.h>
      
      #include <ddb/ddb.h>
      
      static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
      static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes");
      static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
          "file desc to leader structures");
      static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
      MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
      
      MALLOC_DECLARE(M_FADVISE);
      
      static __read_mostly uma_zone_t file_zone;
      static __read_mostly uma_zone_t filedesc0_zone;
      static __read_mostly uma_zone_t pwd_zone;
      static __read_mostly smr_t pwd_smr;
      
      static int        closefp(struct filedesc *fdp, int fd, struct file *fp,
                          struct thread *td, int holdleaders);
      static int        fd_first_free(struct filedesc *fdp, int low, int size);
      static int        fd_last_used(struct filedesc *fdp, int size);
      static void        fdgrowtable(struct filedesc *fdp, int nfd);
      static void        fdgrowtable_exp(struct filedesc *fdp, int nfd);
      static void        fdunused(struct filedesc *fdp, int fd);
      static void        fdused(struct filedesc *fdp, int fd);
      static int        getmaxfd(struct thread *td);
      static u_long        *filecaps_copy_prep(const struct filecaps *src);
      static void        filecaps_copy_finish(const struct filecaps *src,
                          struct filecaps *dst, u_long *ioctls);
      static u_long         *filecaps_free_prep(struct filecaps *fcaps);
      static void        filecaps_free_finish(u_long *ioctls);
      
      static struct pwd *pwd_alloc(void);
      
      /*
       * Each process has:
       *
       * - An array of open file descriptors (fd_ofiles)
       * - An array of file flags (fd_ofileflags)
       * - A bitmap recording which descriptors are in use (fd_map)
       *
       * A process starts out with NDFILE descriptors.  The value of NDFILE has
 * been selected based on the historical limit of 20 open files, and an
       * assumption that the majority of processes, especially short-lived
       * processes like shells, will never need more.
       *
       * If this initial allocation is exhausted, a larger descriptor table and
       * map are allocated dynamically, and the pointers in the process's struct
       * filedesc are updated to point to those.  This is repeated every time
       * the process runs out of file descriptors (provided it hasn't hit its
       * resource limit).
       *
       * Since threads may hold references to individual descriptor table
       * entries, the tables are never freed.  Instead, they are placed on a
       * linked list and freed only when the struct filedesc is released.
       */
      #define NDFILE                20
      #define NDSLOTSIZE        sizeof(NDSLOTTYPE)
      #define        NDENTRIES        (NDSLOTSIZE * __CHAR_BIT)
      #define NDSLOT(x)        ((x) / NDENTRIES)
      #define NDBIT(x)        ((NDSLOTTYPE)1 << ((x) % NDENTRIES))
      #define        NDSLOTS(x)        (((x) + NDENTRIES - 1) / NDENTRIES)
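
/*
 * Worked example for the macros above (a sketch assuming a 64-bit
 * NDSLOTTYPE, so NDENTRIES == 64): descriptor 67 lives in map slot
 * NDSLOT(67) == 1, where it is represented by NDBIT(67) == 1UL << 3,
 * and NDSLOTS(65) == 2 because 65 descriptors need two map words.
 */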
      
      /*
       * SLIST entry used to keep track of ofiles which must be reclaimed when
       * the process exits.
       */
      struct freetable {
              struct fdescenttbl *ft_table;
              SLIST_ENTRY(freetable) ft_next;
      };
      
      /*
       * Initial allocation: a filedesc structure + the head of SLIST used to
       * keep track of old ofiles + enough space for NDFILE descriptors.
       */
      
      struct fdescenttbl0 {
              int        fdt_nfiles;
              struct        filedescent fdt_ofiles[NDFILE];
      };
      
      struct filedesc0 {
              struct filedesc fd_fd;
              SLIST_HEAD(, freetable) fd_free;
              struct        fdescenttbl0 fd_dfiles;
              NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
      };
      
      /*
       * Descriptor management.
       */
      static int __exclusive_cache_line openfiles; /* actual number of open files */
      struct mtx sigio_lock;                /* mtx to protect pointers to sigio */
      void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
      
      /*
       * If low >= size, just return low. Otherwise find the first zero bit in the
       * given bitmap, starting at low and not exceeding size - 1. Return size if
       * not found.
       */
      static int
      fd_first_free(struct filedesc *fdp, int low, int size)
      {
              NDSLOTTYPE *map = fdp->fd_map;
              NDSLOTTYPE mask;
              int off, maxoff;
      
              if (low >= size)
                      return (low);
      
              off = NDSLOT(low);
	if (low % NDENTRIES) {
                      mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
                      if ((mask &= ~map[off]) != 0UL)
                              return (off * NDENTRIES + ffsl(mask) - 1);
                      ++off;
              }
              for (maxoff = NDSLOTS(size); off < maxoff; ++off)
                      if (map[off] != ~0UL)
                              return (off * NDENTRIES + ffsl(~map[off]) - 1);
              return (size);
      }
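
/*
 * Worked example (a sketch, not from the original source): with a 64-bit
 * NDSLOTTYPE, if map[0] has bits 0..4 set (descriptors 0-4 in use) and
 * low == 3, the partial-slot mask keeps only bits 3 and up, so
 * mask & ~map[0] has bit 5 as its lowest set bit and fd_first_free()
 * returns 5, the first free descriptor at or above 'low'.
 */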
      
      /*
       * Find the highest non-zero bit in the given bitmap, starting at 0 and
       * not exceeding size - 1. Return -1 if not found.
       */
      static int
      fd_last_used(struct filedesc *fdp, int size)
      {
              NDSLOTTYPE *map = fdp->fd_map;
              NDSLOTTYPE mask;
              int off, minoff;
      
              off = NDSLOT(size);
              if (size % NDENTRIES) {
                      mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
                      if ((mask &= map[off]) != 0)
                              return (off * NDENTRIES + flsl(mask) - 1);
                      --off;
              }
              for (minoff = NDSLOT(0); off >= minoff; --off)
                      if (map[off] != 0)
                              return (off * NDENTRIES + flsl(map[off]) - 1);
              return (-1);
      }
      
      static int
      fdisused(struct filedesc *fdp, int fd)
      {
      
              KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
                  ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
      
              return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
      }
      
      /*
       * Mark a file descriptor as used.
       */
      static void
      fdused_init(struct filedesc *fdp, int fd)
      {
      
              KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
      
              fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
      }
      
      static void
      fdused(struct filedesc *fdp, int fd)
      {
      
              FILEDESC_XLOCK_ASSERT(fdp);
      
              fdused_init(fdp, fd);
	if (fd > fdp->fd_lastfile)
		fdp->fd_lastfile = fd;
	if (fd == fdp->fd_freefile)
		fdp->fd_freefile++;
      }
      
      /*
       * Mark a file descriptor as unused.
       */
      static void
      fdunused(struct filedesc *fdp, int fd)
{
      
              FILEDESC_XLOCK_ASSERT(fdp);
      
              KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
              KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
                  ("fd=%d is still in use", fd));
      
              fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
	if (fd == fdp->fd_lastfile)
                      fdp->fd_lastfile = fd_last_used(fdp, fd);
      }
      
      /*
       * Free a file descriptor.
       *
       * Avoid some work if fdp is about to be destroyed.
       */
      static inline void
      fdefree_last(struct filedescent *fde)
      {
      
              filecaps_free(&fde->fde_caps);
      }
      
      static inline void
      fdfree(struct filedesc *fdp, int fd)
      {
              struct filedescent *fde;
      
              fde = &fdp->fd_ofiles[fd];
      #ifdef CAPABILITIES
              seqc_write_begin(&fde->fde_seqc);
      #endif
              fde->fde_file = NULL;
      #ifdef CAPABILITIES
              seqc_write_end(&fde->fde_seqc);
      #endif
              fdefree_last(fde);
              fdunused(fdp, fd);
      }
      
      /*
       * System calls on descriptors.
       */
      #ifndef _SYS_SYSPROTO_H_
      struct getdtablesize_args {
              int        dummy;
      };
      #endif
      /* ARGSUSED */
      int
      sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
      {
      #ifdef        RACCT
              uint64_t lim;
      #endif
      
              td->td_retval[0] = getmaxfd(td);
      #ifdef        RACCT
              PROC_LOCK(td->td_proc);
              lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
              PROC_UNLOCK(td->td_proc);
              if (lim < td->td_retval[0])
                      td->td_retval[0] = lim;
      #endif
              return (0);
      }
      
      /*
       * Duplicate a file descriptor to a particular value.
       *
       * Note: keep in mind that a potential race condition exists when closing
       * descriptors from a shared descriptor table (via rfork).
       */
      #ifndef _SYS_SYSPROTO_H_
      struct dup2_args {
              u_int        from;
              u_int        to;
      };
      #endif
      /* ARGSUSED */
      int
      sys_dup2(struct thread *td, struct dup2_args *uap)
{
      
              return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
      }
      
      /*
       * Duplicate a file descriptor.
       */
      #ifndef _SYS_SYSPROTO_H_
      struct dup_args {
              u_int        fd;
      };
      #endif
      /* ARGSUSED */
      int
      sys_dup(struct thread *td, struct dup_args *uap)
{
      
              return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
      }
      
      /*
       * The file control system call.
       */
      #ifndef _SYS_SYSPROTO_H_
      struct fcntl_args {
              int        fd;
              int        cmd;
              long        arg;
      };
      #endif
      /* ARGSUSED */
      int
      sys_fcntl(struct thread *td, struct fcntl_args *uap)
{
      
              return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
      }
      
      int
      kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
{
              struct flock fl;
              struct __oflock ofl;
              intptr_t arg1;
              int error, newcmd;
      
              error = 0;
              newcmd = cmd;
	switch (cmd) {
              case F_OGETLK:
              case F_OSETLK:
              case F_OSETLKW:
                      /*
                       * Convert old flock structure to new.
                       */
                      error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
                      fl.l_start = ofl.l_start;
                      fl.l_len = ofl.l_len;
                      fl.l_pid = ofl.l_pid;
                      fl.l_type = ofl.l_type;
                      fl.l_whence = ofl.l_whence;
                      fl.l_sysid = 0;
      
                      switch (cmd) {
                      case F_OGETLK:
                              newcmd = F_GETLK;
                              break;
                      case F_OSETLK:
                              newcmd = F_SETLK;
                              break;
                      case F_OSETLKW:
                              newcmd = F_SETLKW;
                              break;
                      }
                      arg1 = (intptr_t)&fl;
                      break;
              case F_GETLK:
              case F_SETLK:
              case F_SETLKW:
              case F_SETLK_REMOTE:
                      error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
                      arg1 = (intptr_t)&fl;
                      break;
              default:
                      arg1 = arg;
                      break;
              }
	if (error)
		return (error);
	error = kern_fcntl(td, fd, newcmd, arg1);
	if (error)
		return (error);
	if (cmd == F_OGETLK) {
		ofl.l_start = fl.l_start;
                      ofl.l_len = fl.l_len;
                      ofl.l_pid = fl.l_pid;
                      ofl.l_type = fl.l_type;
                      ofl.l_whence = fl.l_whence;
                      error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
              } else if (cmd == F_GETLK) {
		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
              }
              return (error);
      }
      
      int
      kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
              struct filedesc *fdp;
              struct flock *flp;
              struct file *fp, *fp2;
              struct filedescent *fde;
              struct proc *p;
              struct vnode *vp;
              struct mount *mp;
              int error, flg, seals, tmp;
              uint64_t bsize;
              off_t foffset;
      
              error = 0;
              flg = F_POSIX;
              p = td->td_proc;
              fdp = p->p_fd;
      
	AUDIT_ARG_FD(cmd);
	AUDIT_ARG_CMD(cmd);
	switch (cmd) {
	case F_DUPFD:
		tmp = arg;
                      error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
                      break;
      
              case F_DUPFD_CLOEXEC:
		tmp = arg;
                      error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
                      break;
      
              case F_DUP2FD:
		tmp = arg;
                      error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
                      break;
      
              case F_DUP2FD_CLOEXEC:
		tmp = arg;
                      error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
                      break;
      
              case F_GETFD:
                      error = EBADF;
                      FILEDESC_SLOCK(fdp);
                      fde = fdeget_locked(fdp, fd);
                      if (fde != NULL) {
                              td->td_retval[0] =
			    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
                              error = 0;
                      }
                      FILEDESC_SUNLOCK(fdp);
                      break;
      
              case F_SETFD:
                      error = EBADF;
                      FILEDESC_XLOCK(fdp);
                      fde = fdeget_locked(fdp, fd);
                      if (fde != NULL) {
			fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
                                  (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
                              error = 0;
                      }
                      FILEDESC_XUNLOCK(fdp);
                      break;
      
              case F_GETFL:
                      error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
		if (error != 0)
                              break;
                      td->td_retval[0] = OFLAGS(fp->f_flag);
                      fdrop(fp, td);
                      break;
      
              case F_SETFL:
                      error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
		if (error != 0)
                              break;
                      do {
                              tmp = flg = fp->f_flag;
                              tmp &= ~FCNTLFLAGS;
                              tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
                      } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
                      tmp = fp->f_flag & FNONBLOCK;
                      error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
                      if (error != 0) {
                              fdrop(fp, td);
                              break;
                      }
                      tmp = fp->f_flag & FASYNC;
                      error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
                      if (error == 0) {
                              fdrop(fp, td);
                              break;
                      }
                      atomic_clear_int(&fp->f_flag, FNONBLOCK);
                      tmp = 0;
                      (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
                      fdrop(fp, td);
                      break;
      
              case F_GETOWN:
                      error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
		if (error != 0)
                              break;
                      error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
                      fdrop(fp, td);
                      break;
      
              case F_SETOWN:
                      error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
		if (error != 0)
                              break;
                      tmp = arg;
                      error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
                      fdrop(fp, td);
                      break;
      
              case F_SETLK_REMOTE:
                      error = priv_check(td, PRIV_NFS_LOCKD);
		if (error != 0)
                              return (error);
                      flg = F_REMOTE;
                      goto do_setlk;
      
              case F_SETLKW:
                      flg |= F_WAIT;
                      /* FALLTHROUGH F_SETLK */
      
              case F_SETLK:
              do_setlk:
                      flp = (struct flock *)arg;
		if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
                              error = EINVAL;
                              break;
                      }
      
                      error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
		if (error != 0)
                              break;
                      if (fp->f_type != DTYPE_VNODE) {
                              error = EBADF;
                              fdrop(fp, td);
                              break;
                      }
      
		if (flp->l_whence == SEEK_CUR) {
                              foffset = foffset_get(fp);
                              if (foffset < 0 ||
                                  (flp->l_start > 0 &&
                                   foffset > OFF_MAX - flp->l_start)) {
                                      error = EOVERFLOW;
                                      fdrop(fp, td);
                                      break;
                              }
			flp->l_start += foffset;
                      }
      
                      vp = fp->f_vnode;
		switch (flp->l_type) {
		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0) {
                                      error = EBADF;
                                      break;
                              }
			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
				PROC_LOCK(p->p_leader);
                                      p->p_leader->p_flag |= P_ADVLOCK;
                                      PROC_UNLOCK(p->p_leader);
                              }
                              error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
                                  flp, flg);
                              break;
                      case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0) {
                                      error = EBADF;
                                      break;
                              }
			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
				PROC_LOCK(p->p_leader);
                                      p->p_leader->p_flag |= P_ADVLOCK;
                                      PROC_UNLOCK(p->p_leader);
                              }
                              error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
                                  flp, flg);
                              break;
                      case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
                                  flp, flg);
                              break;
                      case F_UNLCKSYS:
			if (flg != F_REMOTE) {
                                      error = EINVAL;
                                      break;
                              }
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
                                  F_UNLCKSYS, flp, flg);
                              break;
                      default:
                              error = EINVAL;
                              break;
                      }
		if (error != 0 || flp->l_type == F_UNLCK ||
		    flp->l_type == F_UNLCKSYS) {
			fdrop(fp, td);
                              break;
                      }
      
                      /*
                       * Check for a race with close.
                       *
                       * The vnode is now advisory locked (or unlocked, but this case
                       * is not really important) as the caller requested.
                       * We had to drop the filedesc lock, so we need to recheck if
                       * the descriptor is still valid, because if it was closed
                       * in the meantime we need to remove advisory lock from the
                       * vnode - close on any descriptor leading to an advisory
                       * locked vnode, removes that lock.
                       * We will return 0 on purpose in that case, as the result of
                       * successful advisory lock might have been externally visible
                       * already. This is fine - effectively we pretend to the caller
                       * that the closing thread was a bit slower and that the
                       * advisory lock succeeded before the close.
                       */
                      error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2);
                      if (error != 0) {
                              fdrop(fp, td);
                              break;
                      }
		if (fp != fp2) {
			flp->l_whence = SEEK_SET;
                              flp->l_start = 0;
                              flp->l_len = 0;
                              flp->l_type = F_UNLCK;
                              (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
                                  F_UNLCK, flp, F_POSIX);
                      }
		fdrop(fp, td);
                      fdrop(fp2, td);
                      break;
      
              case F_GETLK:
                      error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
		if (error != 0)
                              break;
                      if (fp->f_type != DTYPE_VNODE) {
                              error = EBADF;
                              fdrop(fp, td);
                              break;
                      }
                      flp = (struct flock *)arg;
                      if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
                          flp->l_type != F_UNLCK) {
                              error = EINVAL;
                              fdrop(fp, td);
                              break;
                      }
		if (flp->l_whence == SEEK_CUR) {
                              foffset = foffset_get(fp);
                              if ((flp->l_start > 0 &&
                                  foffset > OFF_MAX - flp->l_start) ||
                                  (flp->l_start < 0 &&
                                  foffset < OFF_MIN - flp->l_start)) {
                                      error = EOVERFLOW;
                                      fdrop(fp, td);
                                      break;
                              }
			flp->l_start += foffset;
                      }
                      vp = fp->f_vnode;
                      error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
                          F_POSIX);
                      fdrop(fp, td);
                      break;
      
              case F_ADD_SEALS:
                      error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
		if (error != 0)
                              break;
                      error = fo_add_seals(fp, arg);
                      fdrop(fp, td);
                      break;
      
              case F_GET_SEALS:
                      error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
		if (error != 0)
                              break;
                      if (fo_get_seals(fp, &seals) == 0)
                              td->td_retval[0] = seals;
                      else
                              error = EINVAL;
                      fdrop(fp, td);
                      break;
      
              case F_RDAHEAD:
		arg = arg ? 128 * 1024: 0;
                      /* FALLTHROUGH */
              case F_READAHEAD:
                      error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
		if (error != 0)
                              break;
                      if (fp->f_type != DTYPE_VNODE) {
                              fdrop(fp, td);
                              error = EBADF;
                              break;
                      }
                      vp = fp->f_vnode;
                      if (vp->v_type != VREG) {
                              fdrop(fp, td);
                              error = ENOTTY;
                              break;
                      }
      
                      /*
                       * Exclusive lock synchronizes against f_seqcount reads and
                       * writes in sequential_heuristic().
                       */
                      error = vn_lock(vp, LK_EXCLUSIVE);
                      if (error != 0) {
                              fdrop(fp, td);
                              break;
                      }
                      if (arg >= 0) {
			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
                              arg = MIN(arg, INT_MAX - bsize + 1);
                              fp->f_seqcount = MIN(IO_SEQMAX,
                                  (arg + bsize - 1) / bsize);
                              atomic_set_int(&fp->f_flag, FRDAHEAD);
                      } else {
			atomic_clear_int(&fp->f_flag, FRDAHEAD);
                      }
                      VOP_UNLOCK(vp);
                      fdrop(fp, td);
                      break;
      
              case F_ISUNIONSTACK:
                      /*
                       * Check if the vnode is part of a union stack (either the
                       * "union" flag from mount(2) or unionfs).
                       *
                       * Prior to introduction of this op libc's readdir would call
                       * fstatfs(2), in effect unnecessarily copying kilobytes of
                       * data just to check fs name and a mount flag.
                       *
                       * Fixing the code to handle everything in the kernel instead
                       * is a non-trivial endeavor and has low priority, thus this
                       * horrible kludge facilitates the current behavior in a much
                       * cheaper manner until someone(tm) sorts this out.
                       */
                      error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
		if (error != 0)
                              break;
                      if (fp->f_type != DTYPE_VNODE) {
                              fdrop(fp, td);
                              error = EBADF;
                              break;
                      }
                      vp = fp->f_vnode;
                      /*
                       * Since we don't prevent dooming th