/*-
       * SPDX-License-Identifier: BSD-3-Clause
       *
       * Copyright (c) 1982, 1986, 1988, 1993
       *        The Regents of the University of California.
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)raw_ip.c        8.7 (Berkeley) 5/15/95
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_inet.h"
      #include "opt_inet6.h"
      #include "opt_ipsec.h"
      
      #include <sys/param.h>
      #include <sys/jail.h>
      #include <sys/kernel.h>
      #include <sys/eventhandler.h>
      #include <sys/lock.h>
      #include <sys/malloc.h>
      #include <sys/mbuf.h>
      #include <sys/priv.h>
      #include <sys/proc.h>
      #include <sys/protosw.h>
      #include <sys/rmlock.h>
      #include <sys/rwlock.h>
      #include <sys/signalvar.h>
      #include <sys/socket.h>
      #include <sys/socketvar.h>
      #include <sys/sx.h>
      #include <sys/sysctl.h>
      #include <sys/systm.h>
      
      #include <vm/uma.h>
      
      #include <net/if.h>
      #include <net/if_var.h>
      #include <net/route.h>
      #include <net/vnet.h>
      
      #include <netinet/in.h>
      #include <netinet/in_systm.h>
      #include <netinet/in_pcb.h>
      #include <netinet/in_var.h>
      #include <netinet/if_ether.h>
      #include <netinet/ip.h>
      #include <netinet/ip_var.h>
      #include <netinet/ip_mroute.h>
      #include <netinet/ip_icmp.h>
      
      #include <netipsec/ipsec_support.h>
      
      #include <machine/stdarg.h>
      #include <security/mac/mac_framework.h>
      
      VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
      SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW,
          &VNET_NAME(ip_defttl), 0,
          "Maximum TTL on IP packets");
      
      VNET_DEFINE(struct inpcbhead, ripcb);
      VNET_DEFINE(struct inpcbinfo, ripcbinfo);
      
      #define        V_ripcb                        VNET(ripcb)
      #define        V_ripcbinfo                VNET(ripcbinfo)
      
      /*
       * Control and data hooks for ipfw, dummynet, divert and so on.
       * The data hooks are not used here but it is convenient
       * to keep them all in one place.
       */
      VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL;
      VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;
      
      int        (*ip_dn_ctl_ptr)(struct sockopt *);
      int        (*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *);
      void        (*ip_divert_ptr)(struct mbuf *, bool);
      int        (*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
      
      #ifdef INET
      /*
        * Hooks for multicast routing.  They all default to NULL, so leave them
        * uninitialized and rely on the BSS being zeroed.
       */
      
      /*
       * The socket used to communicate with the multicast routing daemon.
       */
      VNET_DEFINE(struct socket *, ip_mrouter);
      
      /*
       * The various mrouter and rsvp functions.
       */
      int (*ip_mrouter_set)(struct socket *, struct sockopt *);
      int (*ip_mrouter_get)(struct socket *, struct sockopt *);
      int (*ip_mrouter_done)(void);
      int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
                         struct ip_moptions *);
      int (*mrt_ioctl)(u_long, caddr_t, int);
      int (*legal_vif_num)(int);
      u_long (*ip_mcast_src)(int);
      
      int (*rsvp_input_p)(struct mbuf **, int *, int);
      int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
      void (*ip_rsvp_force_done)(struct socket *);
      #endif /* INET */
      
      extern        struct protosw inetsw[];
      
      u_long        rip_sendspace = 9216;
      SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
          &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
      
      u_long        rip_recvspace = 9216;
      SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
          &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
      
      /*
       * Hash functions
       */
      
      #define INP_PCBHASH_RAW_SIZE        256
      #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
              (((proto) + (laddr) + (faddr)) % (mask) + 1)
      
      #ifdef INET
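       /*
        * Insert an inpcb into the raw IP hash.  A fully specified socket
        * (non-zero protocol, bound local address and connected foreign
        * address) goes into the bucket computed by INP_PCBHASH_RAW(); all
        * other raw sockets go into bucket 0, which rip_input() always scans
        * as a fallback.
        */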
      static void
      rip_inshash(struct inpcb *inp)
      {
              struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
              struct inpcbhead *pcbhash;
              int hash;
      
              INP_INFO_WLOCK_ASSERT(pcbinfo);
              INP_WLOCK_ASSERT(inp);
      
               if (inp->inp_ip_p != 0 &&
                   inp->inp_laddr.s_addr != INADDR_ANY &&
                   inp->inp_faddr.s_addr != INADDR_ANY) {
                       hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
                          inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
              } else
                      hash = 0;
              pcbhash = &pcbinfo->ipi_hashbase[hash];
               CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
      }
      
      static void
      rip_delhash(struct inpcb *inp)
      {
      
              INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
              INP_WLOCK_ASSERT(inp);
      
               CK_LIST_REMOVE(inp, inp_hash);
      }
      #endif /* INET */
      
      /*
       * Raw interface to IP protocol.
       */
      
      /*
       * Initialize raw connection block q.
       */
      static void
      rip_zone_change(void *tag)
      {
      
              uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
      }
      
      static int
      rip_inpcb_init(void *mem, int size, int flags)
      {
              struct inpcb *inp = mem;
      
              INP_LOCK_INIT(inp, "inp", "rawinp");
              return (0);
      }
      
      void
      rip_init(void)
      {
      
              in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE,
                  1, "ripcb", rip_inpcb_init, IPI_HASHFIELDS_NONE);
              EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
                  EVENTHANDLER_PRI_ANY);
      }
      
      #ifdef VIMAGE
      static void
      rip_destroy(void *unused __unused)
      {
      
              in_pcbinfo_destroy(&V_ripcbinfo);
      }
      VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL);
      #endif
      
      #ifdef INET
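       /*
        * Deliver one received datagram to the raw socket 'last': run the
        * IPsec, MAC and minimum-TTL policy checks, optionally gather control
        * data, and append the packet to the socket's receive buffer.
        * Returns non-zero if a policy check rejected the packet.
        */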
      static int
      rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
          struct sockaddr_in *ripsrc)
       {
              int policyfail = 0;
      
              INP_LOCK_ASSERT(last);
      
      #if defined(IPSEC) || defined(IPSEC_SUPPORT)
              /* check AH/ESP integrity. */
               if (IPSEC_ENABLED(ipv4)) {
                      if (IPSEC_CHECK_POLICY(ipv4, n, last) != 0)
                              policyfail = 1;
              }
      #endif /* IPSEC */
      #ifdef MAC
              if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
                      policyfail = 1;
      #endif
              /* Check the minimum TTL for socket. */
              if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
                      policyfail = 1;
               if (!policyfail) {
                      struct mbuf *opts = NULL;
                      struct socket *so;
      
                      so = last->inp_socket;
                       if ((last->inp_flags & INP_CONTROLOPTS) ||
                          (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
                              ip_savecontrol(last, &opts, ip, n);
                      SOCKBUF_LOCK(&so->so_rcv);
                      if (sbappendaddr_locked(&so->so_rcv,
                          (struct sockaddr *)ripsrc, n, opts) == 0) {
                              /* should notify about lost packet */
                              m_freem(n);
                               if (opts)
                                      m_freem(opts);
                              SOCKBUF_UNLOCK(&so->so_rcv);
                      } else
                               sorwakeup_locked(so);
              } else
                      m_freem(n);
              return (policyfail);
      }
      
      /*
        * Set up generic address and protocol structures for the raw_input
        * routine, then pass them along with the mbuf chain.
       */
      int
      rip_input(struct mbuf **mp, int *offp, int proto)
       {
              struct ifnet *ifp;
              struct mbuf *m = *mp;
              struct ip *ip = mtod(m, struct ip *);
              struct inpcb *inp, *last;
              struct sockaddr_in ripsrc;
              int hash;
      
              NET_EPOCH_ASSERT();
      
              *mp = NULL;
      
              bzero(&ripsrc, sizeof(ripsrc));
              ripsrc.sin_len = sizeof(ripsrc);
              ripsrc.sin_family = AF_INET;
              ripsrc.sin_addr = ip->ip_src;
              last = NULL;
      
              ifp = m->m_pkthdr.rcvif;
      
              hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
                  ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
               CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
                      if (inp->inp_ip_p != proto)
                              continue;
      #ifdef INET6
                      /* XXX inp locking */
                      if ((inp->inp_vflag & INP_IPV4) == 0)
                              continue;
      #endif
                      if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
                              continue;
                      if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
                              continue;
                      if (last != NULL) {
                              struct mbuf *n;
      
                              n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
                              if (n != NULL)
                                  (void) rip_append(last, ip, n, &ripsrc);
                              /* XXX count dropped packet */
                              INP_RUNLOCK(last);
                              last = NULL;
                      }
                      INP_RLOCK(inp);
                      if (__predict_false(inp->inp_flags2 & INP_FREED))
                              goto skip_1;
                      if (jailed_without_vnet(inp->inp_cred)) {
                              /*
                               * XXX: If faddr was bound to multicast group,
                               * jailed raw socket will drop datagram.
                               */
                              if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
                                      goto skip_1;
                      }
                      last = inp;
                      continue;
              skip_1:
                      INP_RUNLOCK(inp);
              }
               CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
                       if (inp->inp_ip_p && inp->inp_ip_p != proto)
                              continue;
      #ifdef INET6
                      /* XXX inp locking */
                      if ((inp->inp_vflag & INP_IPV4) == 0)
                              continue;
      #endif
                       if (!in_nullhost(inp->inp_laddr) &&
                          !in_hosteq(inp->inp_laddr, ip->ip_dst))
                              continue;
                       if (!in_nullhost(inp->inp_faddr) &&
                          !in_hosteq(inp->inp_faddr, ip->ip_src))
                              continue;
                       if (last != NULL) {
                              struct mbuf *n;
      
                              n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
                              if (n != NULL)
                                       (void) rip_append(last, ip, n, &ripsrc);
                              /* XXX count dropped packet */
                              INP_RUNLOCK(last);
                              last = NULL;
                      }
                      INP_RLOCK(inp);
                      if (__predict_false(inp->inp_flags2 & INP_FREED))
                              goto skip_2;
                       if (jailed_without_vnet(inp->inp_cred)) {
                              /*
                               * Allow raw socket in jail to receive multicast;
                               * assume process had PRIV_NETINET_RAW at attach,
                               * and fall through into normal filter path if so.
                               */
                              if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
                                  prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
                                      goto skip_2;
                      }
                      /*
                       * If this raw socket has multicast state, and we
                       * have received a multicast, check if this socket
                       * should receive it, as multicast filtering is now
                       * the responsibility of the transport layer.
                       */
                       if (inp->inp_moptions != NULL &&
                          IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
                              /*
                               * If the incoming datagram is for IGMP, allow it
                               * through unconditionally to the raw socket.
                               *
                               * In the case of IGMPv2, we may not have explicitly
                               * joined the group, and may have set IFF_ALLMULTI
                               * on the interface. imo_multi_filter() may discard
                               * control traffic we actually need to see.
                               *
                                * Userland multicast routing daemons should continue
                                * to filter the control traffic appropriately.
                               */
                              int blocked;
      
                              blocked = MCAST_PASS;
                              if (proto != IPPROTO_IGMP) {
                                      struct sockaddr_in group;
      
                                      bzero(&group, sizeof(struct sockaddr_in));
                                      group.sin_len = sizeof(struct sockaddr_in);
                                      group.sin_family = AF_INET;
                                      group.sin_addr = ip->ip_dst;
      
                                      blocked = imo_multi_filter(inp->inp_moptions,
                                          ifp,
                                          (struct sockaddr *)&group,
                                          (struct sockaddr *)&ripsrc);
                              }
      
                               if (blocked != MCAST_PASS) {
                                      IPSTAT_INC(ips_notmember);
                                      goto skip_2;
                              }
                      }
                      last = inp;
                      continue;
              skip_2:
                      INP_RUNLOCK(inp);
              }
              if (last != NULL) {
                       if (rip_append(last, ip, m, &ripsrc) != 0)
                              IPSTAT_INC(ips_delivered);
                      INP_RUNLOCK(last);
              } else {
                      if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) {
                              IPSTAT_INC(ips_noproto);
                              IPSTAT_DEC(ips_delivered);
                              icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
                      } else {
                               m_freem(m);
                      }
              }
              return (IPPROTO_DONE);
      }
      
      /*
        * Generate an IP header and pass the packet to ip_output().  Tack on any
        * options the user may have set up with a control call.
       */
      int
      rip_output(struct mbuf *m, struct socket *so, ...)
       {
              struct epoch_tracker et;
              struct ip *ip;
              int error;
              struct inpcb *inp = sotoinpcb(so);
              va_list ap;
              u_long dst;
              int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
                  IP_ALLOWBROADCAST;
              int cnt, hlen;
              u_char opttype, optlen, *cp;
      
              va_start(ap, so);
               dst = va_arg(ap, u_long);
              va_end(ap);
      
              /*
               * If the user handed us a complete IP packet, use it.  Otherwise,
               * allocate an mbuf for a header and fill it in.
               */
              if ((inp->inp_flags & INP_HDRINCL) == 0) {
                      if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
                              m_freem(m);
                               return (EMSGSIZE);
                      }
                       M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
                      if (m == NULL)
                               return (ENOBUFS);
      
                      INP_RLOCK(inp);
                      ip = mtod(m, struct ip *);
                      ip->ip_tos = inp->inp_ip_tos;
                      if (inp->inp_flags & INP_DONTFRAG)
                              ip->ip_off = htons(IP_DF);
                      else
                              ip->ip_off = htons(0);
                      ip->ip_p = inp->inp_ip_p;
                      ip->ip_len = htons(m->m_pkthdr.len);
                      ip->ip_src = inp->inp_laddr;
                      ip->ip_dst.s_addr = dst;
                       if (jailed(inp->inp_cred)) {
                              /*
                               * prison_local_ip4() would be good enough but would
                               * let a source of INADDR_ANY pass, which we do not
                               * want to see from jails.
                               */
                              if (ip->ip_src.s_addr == INADDR_ANY) {
                                      NET_EPOCH_ENTER(et);
                                      error = in_pcbladdr(inp, &ip->ip_dst,
                                          &ip->ip_src, inp->inp_cred);
                                      NET_EPOCH_EXIT(et);
                              } else {
                                      error = prison_local_ip4(inp->inp_cred,
                                          &ip->ip_src);
                              }
                              if (error != 0) {
                                      INP_RUNLOCK(inp);
                                      m_freem(m);
                                      return (error);
                              }
                      }
                      ip->ip_ttl = inp->inp_ip_ttl;
              } else {
                      if (m->m_pkthdr.len > IP_MAXPACKET) {
                              m_freem(m);
                               return (EMSGSIZE);
                      }
                      ip = mtod(m, struct ip *);
                      hlen = ip->ip_hl << 2;
                       if (m->m_len < hlen) {
                              m = m_pullup(m, hlen);
                              if (m == NULL)
                                      return (EINVAL);
                              ip = mtod(m, struct ip *);
                      }
      
                      INP_RLOCK(inp);
                      /*
                       * Don't allow both user specified and setsockopt options,
                       * and don't allow packet length sizes that will crash.
                       */
                      if ((hlen < sizeof (*ip))
                           || ((hlen > sizeof (*ip)) && inp->inp_options)
                           || (ntohs(ip->ip_len) != m->m_pkthdr.len)) {
                              INP_RUNLOCK(inp);
                              m_freem(m);
                              return (EINVAL);
                      }
                      error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
                      if (error != 0) {
                              INP_RUNLOCK(inp);
                              m_freem(m);
                              return (error);
                      }
                      /*
                       * Don't allow IP options which do not have the required
                       * structure as specified in section 3.1 of RFC 791 on
                       * pages 15-23.
                       */
                      cp = (u_char *)(ip + 1);
                      cnt = hlen - sizeof (struct ip);
                       for (; cnt > 0; cnt -= optlen, cp += optlen) {
                              opttype = cp[IPOPT_OPTVAL];
                              if (opttype == IPOPT_EOL)
                                      break;
                              if (opttype == IPOPT_NOP) {
                                      optlen = 1;
                                      continue;
                              }
                              if (cnt < IPOPT_OLEN + sizeof(u_char)) {
                                       INP_RUNLOCK(inp);
                                      m_freem(m);
                                      return (EINVAL);
                              }
                              optlen = cp[IPOPT_OLEN];
                               if (optlen < IPOPT_OLEN + sizeof(u_char) ||
                                  optlen > cnt) {
                                       INP_RUNLOCK(inp);
                                      m_freem(m);
                                      return (EINVAL);
                              }
                      }
                      /*
                        * This doesn't allow an application to specify an ID of
                        * zero, but we have had this limitation from the beginning.
                       */
                       if (ip->ip_id == 0)
                              ip_fillid(ip);
      
                      /*
                       * XXX prevent ip_output from overwriting header fields.
                       */
                       flags |= IP_RAWOUTPUT;
                      IPSTAT_INC(ips_rawout);
              }
      
              if (inp->inp_flags & INP_ONESBCAST)
                      flags |= IP_SENDONES;
      
      #ifdef MAC
              mac_inpcb_create_mbuf(inp, m);
      #endif
      
              NET_EPOCH_ENTER(et);
              error = ip_output(m, inp->inp_options, NULL, flags,
                  inp->inp_moptions, inp);
              NET_EPOCH_EXIT(et);
              INP_RUNLOCK(inp);
              return (error);
      }
      
      /*
       * Raw IP socket option processing.
       *
       * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
       * only be created by a privileged process, and as such, socket option
       * operations to manage system properties on any raw socket were allowed to
       * take place without explicit additional access control checks.  However,
       * raw sockets can now also be created in jail(), and therefore explicit
       * checks are now required.  Likewise, raw sockets can be used by a process
       * after it gives up privilege, so some caution is required.  For options
       * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
       * performed in ip_ctloutput() and therefore no check occurs here.
       * Unilaterally checking priv_check() here breaks normal IP socket option
       * operations on raw sockets.
       *
       * When adding new socket options here, make sure to add access control
       * checks here as necessary.
       *
       * XXX-BZ inp locking?
       */
      int
      rip_ctloutput(struct socket *so, struct sockopt *sopt)
       {
              struct        inpcb *inp = sotoinpcb(so);
              int        error, optval;
      
               if (sopt->sopt_level != IPPROTO_IP) {
                       if ((sopt->sopt_level == SOL_SOCKET) &&
                          (sopt->sopt_name == SO_SETFIB)) {
                               inp->inp_inc.inc_fibnum = so->so_fibnum;
                              return (0);
                      }
                      return (EINVAL);
              }
      
              error = 0;
              switch (sopt->sopt_dir) {
              case SOPT_GET:
                       switch (sopt->sopt_name) {
                      case IP_HDRINCL:
                              optval = inp->inp_flags & INP_HDRINCL;
                              error = sooptcopyout(sopt, &optval, sizeof optval);
                              break;
      
                      case IP_FW3:        /* generic ipfw v.3 functions */
                      case IP_FW_ADD:        /* ADD actually returns the body... */
                      case IP_FW_GET:
                      case IP_FW_TABLE_GETSIZE:
                      case IP_FW_TABLE_LIST:
                      case IP_FW_NAT_GET_CONFIG:
                      case IP_FW_NAT_GET_LOG:
                               if (V_ip_fw_ctl_ptr != NULL)
                                      error = V_ip_fw_ctl_ptr(sopt);
                              else
                                      error = ENOPROTOOPT;
                              break;
      
                      case IP_DUMMYNET3:        /* generic dummynet v.3 functions */
                      case IP_DUMMYNET_GET:
                               if (ip_dn_ctl_ptr != NULL)
                                      error = ip_dn_ctl_ptr(sopt);
                              else
                                      error = ENOPROTOOPT;
                               break;
      
                      case MRT_INIT:
                      case MRT_DONE:
                      case MRT_ADD_VIF:
                      case MRT_DEL_VIF:
                      case MRT_ADD_MFC:
                      case MRT_DEL_MFC:
                      case MRT_VERSION:
                      case MRT_ASSERT:
                      case MRT_API_SUPPORT:
                      case MRT_API_CONFIG:
                      case MRT_ADD_BW_UPCALL:
                      case MRT_DEL_BW_UPCALL:
                              error = priv_check(curthread, PRIV_NETINET_MROUTE);
                               if (error != 0)
                                      return (error);
                               error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
                                      EOPNOTSUPP;
                              break;
      
                      default:
                               error = ip_ctloutput(so, sopt);
                              break;
                      }
                      break;
      
              case SOPT_SET:
                       switch (sopt->sopt_name) {
                      case IP_HDRINCL:
                              error = sooptcopyin(sopt, &optval, sizeof optval,
                                                  sizeof optval);
                              if (error)
                                      break;
                              if (optval)
                                      inp->inp_flags |= INP_HDRINCL;
                              else
                                      inp->inp_flags &= ~INP_HDRINCL;
                              break;
      
                      case IP_FW3:        /* generic ipfw v.3 functions */
                      case IP_FW_ADD:
                      case IP_FW_DEL:
                      case IP_FW_FLUSH:
                      case IP_FW_ZERO:
                      case IP_FW_RESETLOG:
                      case IP_FW_TABLE_ADD:
                      case IP_FW_TABLE_DEL:
                      case IP_FW_TABLE_FLUSH:
                      case IP_FW_NAT_CFG:
                      case IP_FW_NAT_DEL:
                               if (V_ip_fw_ctl_ptr != NULL)
                                      error = V_ip_fw_ctl_ptr(sopt);
                              else
                                      error = ENOPROTOOPT;
                              break;
      
                      case IP_DUMMYNET3:        /* generic dummynet v.3 functions */
                      case IP_DUMMYNET_CONFIGURE:
                      case IP_DUMMYNET_DEL:
                      case IP_DUMMYNET_FLUSH:
                              if (ip_dn_ctl_ptr != NULL)
                                      error = ip_dn_ctl_ptr(sopt);
                              else
                                       error = ENOPROTOOPT;
                               break;
      
                      case IP_RSVP_ON:
                              error = priv_check(curthread, PRIV_NETINET_MROUTE);
                               if (error != 0)
                                      return (error);
                              error = ip_rsvp_init(so);
                              break;
      
                      case IP_RSVP_OFF:
                              error = priv_check(curthread, PRIV_NETINET_MROUTE);
                              if (error != 0)
                                      return (error);
                              error = ip_rsvp_done();
                              break;
      
                      case IP_RSVP_VIF_ON:
                      case IP_RSVP_VIF_OFF:
                              error = priv_check(curthread, PRIV_NETINET_MROUTE);
                              if (error != 0)
                                      return (error);
                               error = ip_rsvp_vif ?
                                      ip_rsvp_vif(so, sopt) : EINVAL;
                              break;
      
                      case MRT_INIT:
                      case MRT_DONE:
                      case MRT_ADD_VIF:
                      case MRT_DEL_VIF:
                      case MRT_ADD_MFC:
                      case MRT_DEL_MFC:
                      case MRT_VERSION:
                      case MRT_ASSERT:
                      case MRT_API_SUPPORT:
                      case MRT_API_CONFIG:
                      case MRT_ADD_BW_UPCALL:
                      case MRT_DEL_BW_UPCALL:
                              error = priv_check(curthread, PRIV_NETINET_MROUTE);
                              if (error != 0)
                                      return (error);
                               error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
                                              EOPNOTSUPP;
                              break;
      
                      default:
                               error = ip_ctloutput(so, sopt);
                              break;
                      }
                      break;
              }
      
              return (error);
      }
      
      /*
       * This function exists solely to receive the PRC_IFDOWN messages which are
       * sent by if_down().  It looks for an ifaddr whose ifa_addr is sa, and calls
       * in_ifadown() to remove all routes corresponding to that address.  It also
       * receives the PRC_IFUP messages from if_up() and reinstalls the interface
       * routes.
       */
      void
      rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
       {
              struct rm_priotracker in_ifa_tracker;
              struct in_ifaddr *ia;
              struct ifnet *ifp;
              int err;
              int flags;
      
               switch (cmd) {
              case PRC_IFDOWN:
                      IN_IFADDR_RLOCK(&in_ifa_tracker);
                      CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
                              if (ia->ia_ifa.ifa_addr == sa
                                  && (ia->ia_flags & IFA_ROUTE)) {
                                      ifa_ref(&ia->ia_ifa);
                                      IN_IFADDR_RUNLOCK(&in_ifa_tracker);
                                      /*
                                       * in_scrubprefix() kills the interface route.
                                       */
                                      in_scrubprefix(ia, 0);
                                      /*
                                       * in_ifadown gets rid of all the rest of the
                                       * routes.  This is not quite the right thing
                                       * to do, but at least if we are running a
                                       * routing process they will come back.
                                       */
                                      in_ifadown(&ia->ia_ifa, 0);
                                      ifa_free(&ia->ia_ifa);
                                      break;
                              }
                      }
                      if (ia == NULL)                /* If ia matched, already unlocked. */
                              IN_IFADDR_RUNLOCK(&in_ifa_tracker);
                      break;
      
              case PRC_IFUP:
                      IN_IFADDR_RLOCK(&in_ifa_tracker);
                      CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
                              if (ia->ia_ifa.ifa_addr == sa)
                                      break;
                      }
                      if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) {
                              IN_IFADDR_RUNLOCK(&in_ifa_tracker);
                              return;
                      }
                      ifa_ref(&ia->ia_ifa);
                      IN_IFADDR_RUNLOCK(&in_ifa_tracker);
                      flags = RTF_UP;
                      ifp = ia->ia_ifa.ifa_ifp;
      
                      if ((ifp->if_flags & IFF_LOOPBACK)
                          || (ifp->if_flags & IFF_POINTOPOINT))
                              flags |= RTF_HOST;
      
                      err = ifa_del_loopback_route((struct ifaddr *)ia, sa);
      
                      err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
                      if (err == 0)
                              ia->ia_flags |= IFA_ROUTE;
      
                      err = ifa_add_loopback_route((struct ifaddr *)ia, sa);
      
                      ifa_free(&ia->ia_ifa);
                      break;
              }
      }
      
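       /*
        * Attach a new raw IP socket.  Raw sockets require PRIV_NETINET_RAW;
        * the protocol number supplied by the caller and the default TTL are
        * recorded in the new inpcb, which is then entered into the raw hash.
        */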
      static int
      rip_attach(struct socket *so, int proto, struct thread *td)
       {
              struct inpcb *inp;
              int error;
      
              inp = sotoinpcb(so);
              KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
      
              error = priv_check(td, PRIV_NETINET_RAW);
              if (error)
                      return (error);
               if (proto >= IPPROTO_MAX || proto < 0)
                       return (EPROTONOSUPPORT);
              error = soreserve(so, rip_sendspace, rip_recvspace);
              if (error)
                      return (error);
              INP_INFO_WLOCK(&V_ripcbinfo);
              error = in_pcballoc(so, &V_ripcbinfo);
              if (error) {
                      INP_INFO_WUNLOCK(&V_ripcbinfo);
                      return (error);
              }
              inp = (struct inpcb *)so->so_pcb;
              inp->inp_vflag |= INP_IPV4;
              inp->inp_ip_p = proto;
              inp->inp_ip_ttl = V_ip_defttl;
               rip_inshash(inp);
              INP_INFO_WUNLOCK(&V_ripcbinfo);
              INP_WUNLOCK(inp);
              return (0);
      }
      
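       /*
        * Detach a raw IP socket, tearing down any multicast routing or RSVP
        * state associated with it before the inpcb is freed.
        */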
      static void
      rip_detach(struct socket *so)
       {
              struct inpcb *inp;
      
              inp = sotoinpcb(so);
              KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
              KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
                  ("rip_detach: not closed"));
      
              INP_INFO_WLOCK(&V_ripcbinfo);
              INP_WLOCK(inp);
               rip_delhash(inp);
               if (so == V_ip_mrouter && ip_mrouter_done)
                      ip_mrouter_done();
               if (ip_rsvp_force_done)
                       ip_rsvp_force_done(so);
               if (so == V_ip_rsvpd)
                      ip_rsvp_done();
              in_pcbdetach(inp);
              in_pcbfree(inp);
              INP_INFO_WUNLOCK(&V_ripcbinfo);
      }
      
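       /*
        * Common disconnect path for abort, close and disconnect: clear the
        * foreign address, rehash the pcb and mark the socket as no longer
        * connected.
        */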
      static void
      rip_dodisconnect(struct socket *so, struct inpcb *inp)
       {
              struct inpcbinfo *pcbinfo;
      
              pcbinfo = inp->inp_pcbinfo;
              INP_INFO_WLOCK(pcbinfo);
              INP_WLOCK(inp);
               rip_delhash(inp);
               inp->inp_faddr.s_addr = INADDR_ANY;
               rip_inshash(inp);
              SOCK_LOCK(so);
              so->so_state &= ~SS_ISCONNECTED;
              SOCK_UNLOCK(so);
              INP_WUNLOCK(inp);
              INP_INFO_WUNLOCK(pcbinfo);
      }
      
      static void
      rip_abort(struct socket *so)
      {
              struct inpcb *inp;
      
              inp = sotoinpcb(so);
              KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
      
              rip_dodisconnect(so, inp);
      }
      
      static void
      rip_close(struct socket *so)
       {
              struct inpcb *inp;
      
              inp = sotoinpcb(so);
              KASSERT(inp != NULL, ("rip_close: inp == NULL"));
      
               rip_dodisconnect(so, inp);
      }
      
      static int
      rip_disconnect(struct socket *so)
       {
              struct inpcb *inp;
      
              if ((so->so_state & SS_ISCONNECTED) == 0)
                      return (ENOTCONN);
      
              inp = sotoinpcb(so);
              KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
      
               rip_dodisconnect(so, inp);
              return (0);
      }
      
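       /*
        * Bind a raw socket to a local address.  The address must be valid
        * within the caller's jail and, unless INP_BINDANY is set, must be
        * configured on a local interface.
        */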
      static int
      rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
       {
              struct sockaddr_in *addr = (struct sockaddr_in *)nam;
              struct inpcb *inp;
              int error;
      
              if (nam->sa_len != sizeof(*addr))
                      return (EINVAL);
      
              error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
              if (error != 0)
                      return (error);
      
              inp = sotoinpcb(so);
              KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
      
               if (CK_STAILQ_EMPTY(&V_ifnet) ||
                   (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
                   (addr->sin_addr.s_addr &&
                   (inp->inp_flags & INP_BINDANY) == 0 &&
                   ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
                      return (EADDRNOTAVAIL);
      
              INP_INFO_WLOCK(&V_ripcbinfo);
              INP_WLOCK(inp);
               rip_delhash(inp);
               inp->inp_laddr = addr->sin_addr;
               rip_inshash(inp);
              INP_WUNLOCK(inp);
              INP_INFO_WUNLOCK(&V_ripcbinfo);
              return (0);
      }
      
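       /*
        * Connect a raw socket: record the foreign address, rehash the pcb so
        * rip_input() can match it, and mark the socket connected.
        */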
      static int
      rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
       {
              struct sockaddr_in *addr = (struct sockaddr_in *)nam;
              struct inpcb *inp;
      
              if (nam->sa_len != sizeof(*addr))
                      return (EINVAL);
              if (CK_STAILQ_EMPTY(&V_ifnet))
                      return (EADDRNOTAVAIL);
              if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
                      return (EAFNOSUPPORT);
      
              inp = sotoinpcb(so);
              KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
      
              INP_INFO_WLOCK(&V_ripcbinfo);
              INP_WLOCK(inp);
              rip_delhash(inp);
              inp->inp_faddr = addr->sin_addr;
               rip_inshash(inp);
              soisconnected(so);
              INP_WUNLOCK(inp);
              INP_INFO_WUNLOCK(&V_ripcbinfo);
              return (0);
      }
      
      static int
      rip_shutdown(struct socket *so)
       {
              struct inpcb *inp;
      
              inp = sotoinpcb(so);
              KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
      
               INP_WLOCK(inp);
              socantsendmore(so);
              INP_WUNLOCK(inp);
              return (0);
      }
      
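       /*
        * pru_send handler: pick the destination, either the connected foreign
        * address or the sockaddr supplied by the caller, and hand the mbuf to
        * rip_output().  From userland this path is reached by, for example, a
        * sketch along these lines (IPPROTO_ICMP chosen only for illustration):
        *
        *       int s = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        *       sendto(s, buf, len, 0, (struct sockaddr *)&sin, sizeof(sin));
        */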
      static int
      rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
          struct mbuf *control, struct thread *td)
       {
              struct inpcb *inp;
              u_long dst;
      
              inp = sotoinpcb(so);
              KASSERT(inp != NULL, ("rip_send: inp == NULL"));
      
              /*
               * Note: 'dst' reads below are unlocked.
               */
              if (so->so_state & SS_ISCONNECTED) {
                      if (nam) {
                              m_freem(m);
                              return (EISCONN);
                      }
                       dst = inp->inp_faddr.s_addr;        /* Unlocked read. */
              } else {
                      if (nam == NULL) {
                              m_freem(m);
                              return (ENOTCONN);
                      }
                       dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
              }
              return (rip_output(m, so, dst));
      }
      #endif /* INET */
      
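       /*
        * Sysctl handler behind net.inet.raw.pcblist: export the active raw IP
        * pcbs as an array of struct xinpcb, bracketed by two struct xinpgen
        * records so userland can detect changes made while it was copying.
        */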
      static int
      rip_pcblist(SYSCTL_HANDLER_ARGS)
      {
              struct xinpgen xig;
              struct epoch_tracker et;
              struct inpcb *inp;
              int error;
      
              if (req->newptr != 0)
                      return (EPERM);
      
              if (req->oldptr == 0) {
                      int n;
      
                      n = V_ripcbinfo.ipi_count;
                      n += imax(n / 8, 10);
                      req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
                      return (0);
              }
      
              if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
                      return (error);
      
              bzero(&xig, sizeof(xig));
              xig.xig_len = sizeof xig;
              xig.xig_count = V_ripcbinfo.ipi_count;
              xig.xig_gen = V_ripcbinfo.ipi_gencnt;
              xig.xig_sogen = so_gencnt;
              error = SYSCTL_OUT(req, &xig, sizeof xig);
              if (error)
                      return (error);
      
              NET_EPOCH_ENTER(et);
              for (inp = CK_LIST_FIRST(V_ripcbinfo.ipi_listhead);
                  inp != NULL;
                  inp = CK_LIST_NEXT(inp, inp_list)) {
                      INP_RLOCK(inp);
                      if (inp->inp_gencnt <= xig.xig_gen &&
                          cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
                              struct xinpcb xi;
      
                              in_pcbtoxinpcb(inp, &xi);
                              INP_RUNLOCK(inp);
                              error = SYSCTL_OUT(req, &xi, sizeof xi);
                              if (error)
                                      break;
                      } else
                              INP_RUNLOCK(inp);
              }
              NET_EPOCH_EXIT(et);
      
              if (!error) {
                      /*
                       * Give the user an updated idea of our state.  If the
                       * generation differs from what we told her before, she knows
                       * that something happened while we were processing this
                       * request, and it might be necessary to retry.
                       */
                      xig.xig_gen = V_ripcbinfo.ipi_gencnt;
                      xig.xig_sogen = so_gencnt;
                      xig.xig_count = V_ripcbinfo.ipi_count;
                      error = SYSCTL_OUT(req, &xig, sizeof xig);
              }
      
              return (error);
      }
      
      SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist,
          CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
          rip_pcblist, "S,xinpcb",
          "List of active raw IP sockets");
      
      #ifdef INET
      struct pr_usrreqs rip_usrreqs = {
              .pru_abort =                rip_abort,
              .pru_attach =                rip_attach,
              .pru_bind =                rip_bind,
              .pru_connect =                rip_connect,
              .pru_control =                in_control,
              .pru_detach =                rip_detach,
              .pru_disconnect =        rip_disconnect,
              .pru_peeraddr =                in_getpeeraddr,
              .pru_send =                rip_send,
              .pru_shutdown =                rip_shutdown,
              .pru_sockaddr =                in_getsockaddr,
              .pru_sosetlabel =        in_pcbsosetlabel,
              .pru_close =                rip_close,
      };
      #endif /* INET */
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2005 Poul-Henning Kamp
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/kernel.h>
      #include <sys/malloc.h>
      #include <sys/mount.h>
      #include <sys/rwlock.h>
      #include <sys/vnode.h>
      
      static MALLOC_DEFINE(M_VFS_HASH, "vfs_hash", "VFS hash table");
      
      static LIST_HEAD(vfs_hash_head, vnode)        *vfs_hash_tbl;
      static LIST_HEAD(,vnode)                vfs_hash_side;
      static u_long                                vfs_hash_mask;
      static struct rwlock                        vfs_hash_lock;
      
      static void
      vfs_hashinit(void *dummy __unused)
      {
      
              vfs_hash_tbl = hashinit(desiredvnodes, M_VFS_HASH, &vfs_hash_mask);
              rw_init(&vfs_hash_lock, "vfs hash");
              LIST_INIT(&vfs_hash_side);
      }
      
      /* Must be SI_ORDER_SECOND so desiredvnodes is available */
      SYSINIT(vfs_hash, SI_SUB_VFS, SI_ORDER_SECOND, vfs_hashinit, NULL);
      
      u_int
      vfs_hash_index(struct vnode *vp)
      {
      
              return (vp->v_hash + vp->v_mount->mnt_hashseed);
      }
      
      static struct vfs_hash_head *
      vfs_hash_bucket(const struct mount *mp, u_int hash)
      {
      
              return (&vfs_hash_tbl[(hash + mp->mnt_hashseed) & vfs_hash_mask]);
      }
      
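       /*
        * Look up a vnode by (mount, hash).  On a hit the vnode is returned
        * held and locked according to 'flags'; if vget_finish() reports that
        * the vnode was recycled while we slept and LK_NOWAIT is not set, the
        * lookup is retried.  On a miss *vpp is set to NULL and 0 is returned.
        */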
      int
      vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td,
          struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
       {
              struct vnode *vp;
              enum vgetstate vs;
              int error;
      
              while (1) {
                      rw_rlock(&vfs_hash_lock);
                       LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) {
                              if (vp->v_hash != hash)
                                      continue;
                              if (vp->v_mount != mp)
                                      continue;
                               if (fn != NULL && fn(vp, arg))
                                      continue;
                              vs = vget_prep(vp);
                              rw_runlock(&vfs_hash_lock);
                              error = vget_finish(vp, flags, vs);
                              if (error == ENOENT && (flags & LK_NOWAIT) == 0)
                                      break;
                               if (error)
                                      return (error);
                              *vpp = vp;
                              return (0);
                      }
                      if (vp == NULL) {
                              rw_runlock(&vfs_hash_lock);
                              *vpp = NULL;
                              return (0);
                      }
              }
      }
      
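       /*
        * Like vfs_hash_get(), but only acquire a reference on the matching
        * vnode rather than locking it.
        */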
      void
      vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td,
          struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
      {
              struct vnode *vp;
      
              while (1) {
                      rw_rlock(&vfs_hash_lock);
                      LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) {
                              if (vp->v_hash != hash)
                                      continue;
                              if (vp->v_mount != mp)
                                      continue;
                              if (fn != NULL && fn(vp, arg))
                                      continue;
                              vhold(vp);
                              rw_runlock(&vfs_hash_lock);
                              vref(vp);
                              vdrop(vp);
                              *vpp = vp;
                              return;
                      }
                      if (vp == NULL) {
                              rw_runlock(&vfs_hash_lock);
                              *vpp = NULL;
                              return;
                      }
              }
      }
      
      void
      vfs_hash_remove(struct vnode *vp)
       {
      
              rw_wlock(&vfs_hash_lock);
               LIST_REMOVE(vp, v_hashlist);
              rw_wunlock(&vfs_hash_lock);
      }
      
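       /*
        * Insert 'vp' into the hash unless another thread wins the race with an
        * equivalent vnode: in that case 'vp' is parked on the side list and
        * released with vput(), and the pre-existing vnode is returned locked
        * in *vpp.
        *
        * A minimal usage sketch for a filesystem's VFS_VGET() (the comparison
        * callback xx_cmp is hypothetical):
        *
        *       error = vfs_hash_get(mp, ino, LK_EXCLUSIVE, td, vpp, xx_cmp, &ino);
        *       if (error != 0 || *vpp != NULL)
        *               return (error);
        *       ...allocate and initialize a new, locked vnode 'nvp'...
        *       error = vfs_hash_insert(nvp, ino, LK_EXCLUSIVE, td, vpp, xx_cmp, &ino);
        *       if (error != 0 || *vpp != NULL)
        *               return (error);         (lost the race; *vpp is the winner)
        *       *vpp = nvp;
        */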
      int
      vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td,
          struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
       {
              struct vnode *vp2;
              enum vgetstate vs;
              int error;
      
              *vpp = NULL;
              while (1) {
                      rw_wlock(&vfs_hash_lock);
                       LIST_FOREACH(vp2,
                          vfs_hash_bucket(vp->v_mount, hash), v_hashlist) {
                              if (vp2->v_hash != hash)
                                      continue;
                              if (vp2->v_mount != vp->v_mount)
                                      continue;
                              if (fn != NULL && fn(vp2, arg))
                                      continue;
                              vs = vget_prep(vp2);
                              rw_wunlock(&vfs_hash_lock);
                              error = vget_finish(vp2, flags, vs);
                              if (error == ENOENT && (flags & LK_NOWAIT) == 0)
                                      break;
                              rw_wlock(&vfs_hash_lock);
                              LIST_INSERT_HEAD(&vfs_hash_side, vp, v_hashlist);
                              rw_wunlock(&vfs_hash_lock);
                              vput(vp);
                              if (!error)
                                      *vpp = vp2;
                              return (error);
                      }
                      if (vp2 == NULL)
                               break;
               }
              vp->v_hash = hash;
               LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist);
              rw_wunlock(&vfs_hash_lock);
              return (0);
      }
      
      void
      vfs_hash_rehash(struct vnode *vp, u_int hash)
      {
      
              rw_wlock(&vfs_hash_lock);
              LIST_REMOVE(vp, v_hashlist);
              LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist);
              vp->v_hash = hash;
              rw_wunlock(&vfs_hash_lock);
      }
      
      void
      vfs_hash_changesize(u_long newmaxvnodes)
      {
              struct vfs_hash_head *vfs_hash_newtbl, *vfs_hash_oldtbl;
              u_long vfs_hash_newmask, vfs_hash_oldmask;
              struct vnode *vp;
              int i;
      
              vfs_hash_newtbl = hashinit(newmaxvnodes, M_VFS_HASH,
                      &vfs_hash_newmask);
              /* If same hash table size, nothing to do */
              if (vfs_hash_mask == vfs_hash_newmask) {
                      free(vfs_hash_newtbl, M_VFS_HASH);
                      return;
              }
              /*
               * Move everything from the old hash table to the new table.
               * None of the vnodes in the table can be recycled because to
               * do so, they have to be removed from the hash table.
               */
              rw_wlock(&vfs_hash_lock);
              vfs_hash_oldtbl = vfs_hash_tbl;
              vfs_hash_oldmask = vfs_hash_mask;
              vfs_hash_tbl = vfs_hash_newtbl;
              vfs_hash_mask = vfs_hash_newmask;
              for (i = 0; i <= vfs_hash_oldmask; i++) {
                      while ((vp = LIST_FIRST(&vfs_hash_oldtbl[i])) != NULL) {
                              LIST_REMOVE(vp, v_hashlist);
                              LIST_INSERT_HEAD(
                                  vfs_hash_bucket(vp->v_mount, vp->v_hash),
                                  vp, v_hashlist);
                      }
              }
              rw_wunlock(&vfs_hash_lock);
              free(vfs_hash_oldtbl, M_VFS_HASH);
      }
      /*-
       * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
       *
       * Copyright (c) 1991, 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * This code is derived from software contributed to Berkeley by
       * The Mach Operating System project at Carnegie-Mellon University.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        from: @(#)vm_kern.c        8.3 (Berkeley) 1/12/94
       *
       *
       * Copyright (c) 1987, 1990 Carnegie-Mellon University.
       * All rights reserved.
       *
       * Authors: Avadis Tevanian, Jr., Michael Wayne Young
       *
       * Permission to use, copy, modify and distribute this software and
       * its documentation is hereby granted, provided that both the copyright
       * notice and this permission notice appear in all copies of the
       * software, derivative works or modified versions, and any portions
       * thereof, and that both notices appear in supporting documentation.
       *
       * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
       * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
       * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
       *
       * Carnegie Mellon requests users of this software to return to
       *
       *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
       *  School of Computer Science
       *  Carnegie Mellon University
       *  Pittsburgh PA 15213-3890
       *
       * any improvements or extensions that they make and grant Carnegie the
       * rights to redistribute these changes.
       */
      
      /*
       *        Kernel memory management.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_vm.h"
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/kernel.h>                /* for ticks and hz */
      #include <sys/domainset.h>
      #include <sys/eventhandler.h>
      #include <sys/lock.h>
      #include <sys/proc.h>
      #include <sys/malloc.h>
      #include <sys/rwlock.h>
      #include <sys/sysctl.h>
      #include <sys/vmem.h>
      #include <sys/vmmeter.h>
      
      #include <vm/vm.h>
      #include <vm/vm_param.h>
      #include <vm/vm_domainset.h>
      #include <vm/vm_kern.h>
      #include <vm/pmap.h>
      #include <vm/vm_map.h>
      #include <vm/vm_object.h>
      #include <vm/vm_page.h>
      #include <vm/vm_pageout.h>
      #include <vm/vm_phys.h>
      #include <vm/vm_pagequeue.h>
      #include <vm/vm_radix.h>
      #include <vm/vm_extern.h>
      #include <vm/uma.h>
      
      vm_map_t kernel_map;
      vm_map_t exec_map;
      vm_map_t pipe_map;
      
      const void *zero_region;
      CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0);
      
      /* NB: Used by kernel debuggers. */
      const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS;
      
      u_int exec_map_entry_size;
      u_int exec_map_entries;
      
      SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD,
          SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address");
      
      SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD,
      #if defined(__arm__)
          &vm_max_kernel_address, 0,
      #else
          SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS,
      #endif
          "Max kernel address");
      
      #if VM_NRESERVLEVEL > 0
      #define        KVA_QUANTUM_SHIFT        (VM_LEVEL_0_ORDER + PAGE_SHIFT)
      #else
      /* On non-superpage architectures we want large import sizes. */
      #define        KVA_QUANTUM_SHIFT        (8 + PAGE_SHIFT)
      #endif
      #define        KVA_QUANTUM                (1 << KVA_QUANTUM_SHIFT)
      
      extern void     uma_startup2(void);
      
      /*
       *        kva_alloc:
       *
       *        Allocate a virtual address range with no underlying object and
       *        no initial mapping to physical memory.  Any mapping from this
       *        range to physical memory must be explicitly created prior to
       *        its use, typically with pmap_qenter().  Any attempt to create
       *        a mapping on demand through vm_fault() will result in a panic. 
       */
      vm_offset_t
      kva_alloc(vm_size_t size)
      {
              vm_offset_t addr;
      
              size = round_page(size);
              if (vmem_alloc(kernel_arena, size, M_BESTFIT | M_NOWAIT, &addr))
                      return (0);
      
              return (addr);
      }
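
       /*
        * Illustrative sketch (not part of the original sources): a caller
        * might pair kva_alloc() with pmap_qenter()/pmap_qremove() to map
        * pages it already owns into the allocated range, and release the
        * range with kva_free() once the mapping has been torn down.  The
        * page array "pages" and its length "npages" are assumed to be
        * provided by the caller.
        *
        *        vm_offset_t va;
        *
        *        va = kva_alloc(npages * PAGE_SIZE);
        *        if (va == 0)
        *                return (ENOMEM);
        *        pmap_qenter(va, pages, npages);
        *        ...use the mapping at va...
        *        pmap_qremove(va, npages);
        *        kva_free(va, npages * PAGE_SIZE);
        */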
      
      /*
       *        kva_free:
       *
        *        Release a region of kernel virtual memory allocated
        *        with kva_alloc.  No physical pages are freed here;
        *        the caller is responsible for removing any mappings
        *        covering the region before calling this function.
       *
       *        This routine may not block on kernel maps.
       */
      void
      kva_free(vm_offset_t addr, vm_size_t size)
      {
      
              size = round_page(size);
              vmem_free(kernel_arena, addr, size);
      }
      
      /*
       *        Allocates a region from the kernel address map and physical pages
       *        within the specified address range to the kernel object.  Creates a
       *        wired mapping from this region to these pages, and returns the
       *        region's starting virtual address.  The allocated pages are not
       *        necessarily physically contiguous.  If M_ZERO is specified through the
       *        given flags, then the pages are zeroed before they are mapped.
       */
      static vm_offset_t
      kmem_alloc_attr_domain(int domain, vm_size_t size, int flags, vm_paddr_t low,
          vm_paddr_t high, vm_memattr_t memattr)
      {
              vmem_t *vmem;
              vm_object_t object = kernel_object;
              vm_offset_t addr, i, offset;
              vm_page_t m;
              int pflags, tries;
              vm_prot_t prot;
      
              size = round_page(size);
              vmem = vm_dom[domain].vmd_kernel_arena;
              if (vmem_alloc(vmem, size, M_BESTFIT | flags, &addr))
                      return (0);
              offset = addr - VM_MIN_KERNEL_ADDRESS;
              pflags = malloc2vm_flags(flags) | VM_ALLOC_WIRED;
              pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
              pflags |= VM_ALLOC_NOWAIT;
              prot = (flags & M_EXEC) != 0 ? VM_PROT_ALL : VM_PROT_RW;
              VM_OBJECT_WLOCK(object);
              for (i = 0; i < size; i += PAGE_SIZE) {
                      tries = 0;
      retry:
                      m = vm_page_alloc_contig_domain(object, atop(offset + i),
                          domain, pflags, 1, low, high, PAGE_SIZE, 0, memattr);
                      if (m == NULL) {
                              VM_OBJECT_WUNLOCK(object);
                              if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
                                      if (!vm_page_reclaim_contig_domain(domain,
                                          pflags, 1, low, high, PAGE_SIZE, 0) &&
                                          (flags & M_WAITOK) != 0)
                                              vm_wait_domain(domain);
                                      VM_OBJECT_WLOCK(object);
                                      tries++;
                                      goto retry;
                              }
                              kmem_unback(object, addr, i);
                              vmem_free(vmem, addr, size);
                              return (0);
                      }
                      KASSERT(vm_phys_domain(m) == domain,
                          ("kmem_alloc_attr_domain: Domain mismatch %d != %d",
                          vm_phys_domain(m), domain));
                      if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
                              pmap_zero_page(m);
                      vm_page_valid(m);
                      pmap_enter(kernel_pmap, addr + i, m, prot,
                          prot | PMAP_ENTER_WIRED, 0);
              }
              VM_OBJECT_WUNLOCK(object);
              return (addr);
      }
      
      vm_offset_t
      kmem_alloc_attr(vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high,
          vm_memattr_t memattr)
      {
      
              return (kmem_alloc_attr_domainset(DOMAINSET_RR(), size, flags, low,
                  high, memattr));
      }
      
      vm_offset_t
      kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags,
          vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr)
      {
              struct vm_domainset_iter di;
              vm_offset_t addr;
              int domain;
      
              vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
              do {
                      addr = kmem_alloc_attr_domain(domain, size, flags, low, high,
                          memattr);
                      if (addr != 0)
                              break;
              } while (vm_domainset_iter_policy(&di, &domain) == 0);
      
              return (addr);
      }
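
       /*
        * Illustrative sketch (not part of the original sources):
        * kmem_alloc_attr() returns wired kernel memory whose backing pages
        * fall within a physical address range, for example below 4GB for a
        * hypothetical device that can only address 32 bits, possibly with a
        * non-default memory attribute:
        *
        *        vm_offset_t va;
        *
        *        va = kmem_alloc_attr(size, M_WAITOK | M_ZERO, 0,
        *            0xffffffff, VM_MEMATTR_DEFAULT);
        *        if (va == 0)
        *                return (ENOMEM);
        *        ...
        *        kmem_free(va, size);
        */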
      
      /*
       *        Allocates a region from the kernel address map and physically
       *        contiguous pages within the specified address range to the kernel
       *        object.  Creates a wired mapping from this region to these pages, and
       *        returns the region's starting virtual address.  If M_ZERO is specified
       *        through the given flags, then the pages are zeroed before they are
       *        mapped.
       */
      static vm_offset_t
      kmem_alloc_contig_domain(int domain, vm_size_t size, int flags, vm_paddr_t low,
          vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
          vm_memattr_t memattr)
      {
              vmem_t *vmem;
              vm_object_t object = kernel_object;
              vm_offset_t addr, offset, tmp;
              vm_page_t end_m, m;
              u_long npages;
              int pflags, tries;
       
              size = round_page(size);
              vmem = vm_dom[domain].vmd_kernel_arena;
              if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
                      return (0);
              offset = addr - VM_MIN_KERNEL_ADDRESS;
              pflags = malloc2vm_flags(flags) | VM_ALLOC_WIRED;
              pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
              pflags |= VM_ALLOC_NOWAIT;
              npages = atop(size);
              VM_OBJECT_WLOCK(object);
              tries = 0;
      retry:
              m = vm_page_alloc_contig_domain(object, atop(offset), domain, pflags,
                  npages, low, high, alignment, boundary, memattr);
              if (m == NULL) {
                      VM_OBJECT_WUNLOCK(object);
                      if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
                              if (!vm_page_reclaim_contig_domain(domain, pflags,
                                  npages, low, high, alignment, boundary) &&
                                  (flags & M_WAITOK) != 0)
                                      vm_wait_domain(domain);
                              VM_OBJECT_WLOCK(object);
                              tries++;
                              goto retry;
                      }
                      vmem_free(vmem, addr, size);
                      return (0);
              }
              KASSERT(vm_phys_domain(m) == domain,
                  ("kmem_alloc_contig_domain: Domain mismatch %d != %d",
                  vm_phys_domain(m), domain));
              end_m = m + npages;
              tmp = addr;
              for (; m < end_m; m++) {
                      if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
                              pmap_zero_page(m);
                      vm_page_valid(m);
                      pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW,
                          VM_PROT_RW | PMAP_ENTER_WIRED, 0);
                      tmp += PAGE_SIZE;
              }
              VM_OBJECT_WUNLOCK(object);
              return (addr);
      }
      
      vm_offset_t
      kmem_alloc_contig(vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high,
          u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr)
      {
      
              return (kmem_alloc_contig_domainset(DOMAINSET_RR(), size, flags, low,
                  high, alignment, boundary, memattr));
      }
      
      vm_offset_t
      kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags,
          vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
          vm_memattr_t memattr)
      {
              struct vm_domainset_iter di;
              vm_offset_t addr;
              int domain;
      
              vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
              do {
                      addr = kmem_alloc_contig_domain(domain, size, flags, low, high,
                          alignment, boundary, memattr);
                      if (addr != 0)
                              break;
              } while (vm_domainset_iter_policy(&di, &domain) == 0);
      
              return (addr);
      }
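
       /*
        * Illustrative sketch (not part of the original sources):
        * kmem_alloc_contig() returns a wired mapping of physically
        * contiguous pages, which is useful for DMA buffers with alignment
        * and boundary constraints, e.g. a page-aligned buffer that must not
        * cross a 1MB physical boundary:
        *
        *        vm_offset_t va;
        *
        *        va = kmem_alloc_contig(size, M_WAITOK | M_ZERO, 0,
        *            ~(vm_paddr_t)0, PAGE_SIZE, 1024 * 1024,
        *            VM_MEMATTR_DEFAULT);
        *        if (va == 0)
        *                return (ENOMEM);
        *        ...
        *        kmem_free(va, size);
        */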
      
      /*
       *        kmem_suballoc:
       *
       *        Allocates a map to manage a subrange
       *        of the kernel virtual address space.
       *
       *        Arguments are as follows:
       *
       *        parent                Map to take range from
       *        min, max        Returned endpoints of map
       *        size                Size of range to find
       *        superpage_align        Request that min is superpage aligned
       */
      vm_map_t
      kmem_suballoc(vm_map_t parent, vm_offset_t *min, vm_offset_t *max,
          vm_size_t size, boolean_t superpage_align)
      {
              int ret;
              vm_map_t result;
      
              size = round_page(size);
      
              *min = vm_map_min(parent);
              ret = vm_map_find(parent, NULL, 0, min, size, 0, superpage_align ?
                  VMFS_SUPER_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL,
                  MAP_ACC_NO_CHARGE);
              if (ret != KERN_SUCCESS)
                      panic("kmem_suballoc: bad status return of %d", ret);
              *max = *min + size;
              result = vm_map_create(vm_map_pmap(parent), *min, *max);
              if (result == NULL)
                      panic("kmem_suballoc: cannot create submap");
              if (vm_map_submap(parent, *min, *max, result) != KERN_SUCCESS)
                      panic("kmem_suballoc: unable to change range to submap");
              return (result);
      }
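
       /*
        * Illustrative sketch (not part of the original sources): this is the
        * pattern used to carve submaps such as exec_map and pipe_map out of
        * kernel_map at boot time; "maxsize" is assumed to be computed by the
        * caller.
        *
        *        vm_offset_t minaddr, maxaddr;
        *        vm_map_t submap;
        *
        *        submap = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
        *            maxsize, FALSE);
        */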
      
      /*
       *        kmem_malloc_domain:
       *
       *        Allocate wired-down pages in the kernel's address space.
       */
      static vm_offset_t
      kmem_malloc_domain(int domain, vm_size_t size, int flags)
      {
              vmem_t *arena;
              vm_offset_t addr;
              int rv;
      
      #if VM_NRESERVLEVEL > 0
              if (__predict_true((flags & M_EXEC) == 0))
                      arena = vm_dom[domain].vmd_kernel_arena;
              else
                      arena = vm_dom[domain].vmd_kernel_rwx_arena;
      #else
              arena = vm_dom[domain].vmd_kernel_arena;
      #endif
              size = round_page(size);
              if (vmem_alloc(arena, size, flags | M_BESTFIT, &addr))
                      return (0);
      
              rv = kmem_back_domain(domain, kernel_object, addr, size, flags);
              if (rv != KERN_SUCCESS) {
                      vmem_free(arena, addr, size);
                      return (0);
              }
              return (addr);
      }
      
      vm_offset_t
      kmem_malloc(vm_size_t size, int flags)
      {
      
              return (kmem_malloc_domainset(DOMAINSET_RR(), size, flags));
      }
      
      vm_offset_t
      kmem_malloc_domainset(struct domainset *ds, vm_size_t size, int flags)
       {
              struct vm_domainset_iter di;
              vm_offset_t addr;
              int domain;
      
              vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
              do {
                      addr = kmem_malloc_domain(domain, size, flags);
                       if (addr != 0)
                              break;
              } while (vm_domainset_iter_policy(&di, &domain) == 0);
      
              return (addr);
      }
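
       /*
        * Illustrative sketch (not part of the original sources):
        * kmem_malloc() hands back wired, mapped kernel memory and is the
        * low-level path used by malloc(9) and UMA for their larger
        * allocations; a direct caller must remember the size in order to
        * release the memory with kmem_free():
        *
        *        vm_offset_t va;
        *
        *        va = kmem_malloc(size, M_NOWAIT | M_ZERO);
        *        if (va == 0)
        *                return (ENOMEM);
        *        ...
        *        kmem_free(va, size);
        */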
      
      /*
       *        kmem_back_domain:
       *
       *        Allocate physical pages from the specified domain for the specified
       *        virtual address range.
       */
      int
      kmem_back_domain(int domain, vm_object_t object, vm_offset_t addr,
          vm_size_t size, int flags)
       {
              vm_offset_t offset, i;
              vm_page_t m, mpred;
              vm_prot_t prot;
              int pflags;
      
              KASSERT(object == kernel_object,
                  ("kmem_back_domain: only supports kernel object."));
      
              offset = addr - VM_MIN_KERNEL_ADDRESS;
              pflags = malloc2vm_flags(flags) | VM_ALLOC_WIRED;
              pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
              if (flags & M_WAITOK)
                      pflags |= VM_ALLOC_WAITFAIL;
              prot = (flags & M_EXEC) != 0 ? VM_PROT_ALL : VM_PROT_RW;
      
              i = 0;
              VM_OBJECT_WLOCK(object);
      retry:
              mpred = vm_radix_lookup_le(&object->rtree, atop(offset + i));
               for (; i < size; i += PAGE_SIZE, mpred = m) {
                      m = vm_page_alloc_domain_after(object, atop(offset + i),
                          domain, pflags, mpred);
      
                      /*
                       * Ran out of space, free everything up and return. Don't need
                       * to lock page queues here as we know that the pages we got
                       * aren't on any queues.
                       */
                      if (m == NULL) {
                              if ((flags & M_NOWAIT) == 0)
                                      goto retry;
                              VM_OBJECT_WUNLOCK(object);
                              kmem_unback(object, addr, i);
                              return (KERN_NO_SPACE);
                      }
                      KASSERT(vm_phys_domain(m) == domain,
                          ("kmem_back_domain: Domain mismatch %d != %d",
                          vm_phys_domain(m), domain));
                       if (flags & M_ZERO && (m->flags & PG_ZERO) == 0)
                               pmap_zero_page(m);
                      KASSERT((m->oflags & VPO_UNMANAGED) != 0,
                          ("kmem_malloc: page %p is managed", m));
                      vm_page_valid(m);
                      pmap_enter(kernel_pmap, addr + i, m, prot,
                          prot | PMAP_ENTER_WIRED, 0);
      #if VM_NRESERVLEVEL > 0
                       if (__predict_false((prot & VM_PROT_EXECUTE) != 0))
                              m->oflags |= VPO_KMEM_EXEC;
      #endif
              }
              VM_OBJECT_WUNLOCK(object);
      
              return (KERN_SUCCESS);
      }
      
      /*
       *        kmem_back:
       *
       *        Allocate physical pages for the specified virtual address range.
       */
      int
      kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags)
      {
              vm_offset_t end, next, start;
              int domain, rv;
      
              KASSERT(object == kernel_object,
                  ("kmem_back: only supports kernel object."));
      
              for (start = addr, end = addr + size; addr < end; addr = next) {
                      /*
                       * We must ensure that pages backing a given large virtual page
                       * all come from the same physical domain.
                       */
                      if (vm_ndomains > 1) {
                              domain = (addr >> KVA_QUANTUM_SHIFT) % vm_ndomains;
                              while (VM_DOMAIN_EMPTY(domain))
                                      domain++;
                              next = roundup2(addr + 1, KVA_QUANTUM);
                              if (next > end || next < start)
                                      next = end;
                      } else {
                              domain = 0;
                              next = end;
                      }
                      rv = kmem_back_domain(domain, object, addr, next - addr, flags);
                      if (rv != KERN_SUCCESS) {
                              kmem_unback(object, start, addr - start);
                              break;
                      }
              }
              return (rv);
      }
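
       /*
        * Illustrative sketch (not part of the original sources): kmem_back()
        * and kmem_unback() let a caller manage the physical backing of a KVA
        * range obtained from kva_alloc() separately from the address range
        * itself:
        *
        *        vm_offset_t va;
        *
        *        va = kva_alloc(size);
        *        if (va == 0)
        *                return (ENOMEM);
        *        if (kmem_back(kernel_object, va, size, M_WAITOK | M_ZERO) !=
        *            KERN_SUCCESS) {
        *                kva_free(va, size);
        *                return (ENOMEM);
        *        }
        *        ...
        *        kmem_unback(kernel_object, va, size);
        *        kva_free(va, size);
        */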
      
      /*
       *        kmem_unback:
       *
       *        Unmap and free the physical pages underlying the specified virtual
       *        address range.
       *
       *        A physical page must exist within the specified object at each index
       *        that is being unmapped.
       */
      static struct vmem *
      _kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
       {
              struct vmem *arena;
              vm_page_t m, next;
              vm_offset_t end, offset;
              int domain;
      
              KASSERT(object == kernel_object,
                  ("kmem_unback: only supports kernel object."));
      
              if (size == 0)
                      return (NULL);
              pmap_remove(kernel_pmap, addr, addr + size);
              offset = addr - VM_MIN_KERNEL_ADDRESS;
              end = offset + size;
              VM_OBJECT_WLOCK(object);
              m = vm_page_lookup(object, atop(offset)); 
              domain = vm_phys_domain(m);
      #if VM_NRESERVLEVEL > 0
              if (__predict_true((m->oflags & VPO_KMEM_EXEC) == 0))
                      arena = vm_dom[domain].vmd_kernel_arena;
              else
                      arena = vm_dom[domain].vmd_kernel_rwx_arena;
      #else
              arena = vm_dom[domain].vmd_kernel_arena;
      #endif
               for (; offset < end; offset += PAGE_SIZE, m = next) {
                      next = vm_page_next(m);
                      vm_page_xbusy_claim(m);
                      vm_page_unwire_noq(m);
                      vm_page_free(m);
              }
              VM_OBJECT_WUNLOCK(object);
      
              return (arena);
      }
      
      void
      kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
      {
      
              (void)_kmem_unback(object, addr, size);
      }
      
      /*
       *        kmem_free:
       *
       *        Free memory allocated with kmem_malloc.  The size must match the
       *        original allocation.
       */
      void
      kmem_free(vm_offset_t addr, vm_size_t size)
       {
              struct vmem *arena;
      
              size = round_page(size);
              arena = _kmem_unback(kernel_object, addr, size);
              if (arena != NULL)
                       vmem_free(arena, addr, size);
      }
      
      /*
       *        kmap_alloc_wait:
       *
       *        Allocates pageable memory from a sub-map of the kernel.  If the submap
       *        has no room, the caller sleeps waiting for more memory in the submap.
       *
       *        This routine may block.
       */
      vm_offset_t
      kmap_alloc_wait(vm_map_t map, vm_size_t size)
      {
              vm_offset_t addr;
      
              size = round_page(size);
              if (!swap_reserve(size))
                      return (0);
      
              for (;;) {
                      /*
                       * To make this work for more than one map, use the map's lock
                       * to lock out sleepers/wakers.
                       */
                      vm_map_lock(map);
                      addr = vm_map_findspace(map, vm_map_min(map), size);
                      if (addr + size <= vm_map_max(map))
                              break;
                      /* no space now; see if we can ever get space */
                      if (vm_map_max(map) - vm_map_min(map) < size) {
                              vm_map_unlock(map);
                              swap_release(size);
                              return (0);
                      }
                      map->needs_wakeup = TRUE;
                      vm_map_unlock_and_wait(map, 0);
              }
              vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_RW, VM_PROT_RW,
                  MAP_ACC_CHARGED);
              vm_map_unlock(map);
              return (addr);
      }
      
      /*
       *        kmap_free_wakeup:
       *
       *        Returns memory to a submap of the kernel, and wakes up any processes
       *        waiting for memory in that map.
       */
      void
      kmap_free_wakeup(vm_map_t map, vm_offset_t addr, vm_size_t size)
      {
      
              vm_map_lock(map);
              (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size));
              if (map->needs_wakeup) {
                      map->needs_wakeup = FALSE;
                      vm_map_wakeup(map);
              }
              vm_map_unlock(map);
      }
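
       /*
        * Illustrative sketch (not part of the original sources):
        * kmap_alloc_wait() and kmap_free_wakeup() operate on pageable
        * submaps such as exec_map, where callers are willing to sleep until
        * address space becomes available:
        *
        *        vm_offset_t va;
        *
        *        va = kmap_alloc_wait(exec_map, size);
        *        if (va == 0)
        *                return (ENOMEM);
        *        ...
        *        kmap_free_wakeup(exec_map, va, size);
        */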
      
      void
      kmem_init_zero_region(void)
      {
              vm_offset_t addr, i;
              vm_page_t m;
      
              /*
               * Map a single physical page of zeros to a larger virtual range.
                * This reduces looping in places that want large amounts of
                * zeros, while consuming only a single page of physical memory.
               */
              addr = kva_alloc(ZERO_REGION_SIZE);
              m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
                  VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
              if ((m->flags & PG_ZERO) == 0)
                      pmap_zero_page(m);
              for (i = 0; i < ZERO_REGION_SIZE; i += PAGE_SIZE)
                      pmap_qenter(addr + i, &m, 1);
              pmap_protect(kernel_pmap, addr, addr + ZERO_REGION_SIZE, VM_PROT_READ);
      
              zero_region = (const void *)addr;
      }
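
       /*
        * Illustrative sketch (not part of the original sources): consumers of
        * zero_region compare or copy in ZERO_REGION_SIZE chunks rather than
        * looping one word at a time, e.g. to test whether "len" bytes at "p"
        * are all zero:
        *
        *        while (len > 0) {
        *                size_t chunk = MIN(len, ZERO_REGION_SIZE);
        *
        *                if (memcmp(p, zero_region, chunk) != 0)
        *                        return (false);
        *                p += chunk;
        *                len -= chunk;
        *        }
        *        return (true);
        */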
      
      /*
       * Import KVA from the kernel map into the kernel arena.
       */
      static int
      kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp)
      {
              vm_offset_t addr;
              int result;
      
              KASSERT((size % KVA_QUANTUM) == 0,
                  ("kva_import: Size %jd is not a multiple of %d",
                  (intmax_t)size, (int)KVA_QUANTUM));
              addr = vm_map_min(kernel_map);
              result = vm_map_find(kernel_map, NULL, 0, &addr, size, 0,
                  VMFS_SUPER_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
              if (result != KERN_SUCCESS)
                      return (ENOMEM);
      
              *addrp = addr;
      
              return (0);
      }
      
      /*
       * Import KVA from a parent arena into a per-domain arena.  Imports must be
       * KVA_QUANTUM-aligned and a multiple of KVA_QUANTUM in size.
       */
      static int
      kva_import_domain(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp)
      {
      
              KASSERT((size % KVA_QUANTUM) == 0,
                  ("kva_import_domain: Size %jd is not a multiple of %d",
                  (intmax_t)size, (int)KVA_QUANTUM));
              return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN,
                  VMEM_ADDR_MAX, flags, addrp));
      }
      
      /*
       *         kmem_init:
       *
        *        Create the kernel map; insert a mapping covering kernel text,
        *        data, bss, and all space allocated thus far (`bootstrap' data).  The
        *        new map will thus map the range between VM_MIN_KERNEL_ADDRESS and
        *        `start' as allocated, and the range between `start' and `end' as free.
       *        Create the kernel vmem arena and its per-domain children.
       */
      void
      kmem_init(vm_offset_t start, vm_offset_t end)
      {
              vm_map_t m;
              int domain;
      
              m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end);
              m->system_map = 1;
              vm_map_lock(m);
              /* N.B.: cannot use kgdb to debug, starting with this assignment ... */
              kernel_map = m;
              (void)vm_map_insert(m, NULL, 0,
      #ifdef __amd64__
                  KERNBASE,
      #else                     
                  VM_MIN_KERNEL_ADDRESS,
      #endif
                  start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
              /* ... and ending with the completion of the above `insert' */
      
      #ifdef __amd64__
              /*
               * Mark KVA used for the page array as allocated.  Other platforms
               * that handle vm_page_array allocation can simply adjust virtual_avail
               * instead.
               */
              (void)vm_map_insert(m, NULL, 0, (vm_offset_t)vm_page_array,
                  (vm_offset_t)vm_page_array + round_2mpage(vm_page_array_size *
                  sizeof(struct vm_page)),
                  VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
      #endif
              vm_map_unlock(m);
      
              /*
               * Initialize the kernel_arena.  This can grow on demand.
               */
              vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0);
              vmem_set_import(kernel_arena, kva_import, NULL, NULL, KVA_QUANTUM);
      
              for (domain = 0; domain < vm_ndomains; domain++) {
                      /*
                       * Initialize the per-domain arenas.  These are used to color
                       * the KVA space in a way that ensures that virtual large pages
                       * are backed by memory from the same physical domain,
                       * maximizing the potential for superpage promotion.
                       */
                      vm_dom[domain].vmd_kernel_arena = vmem_create(
                          "kernel arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK);
                      vmem_set_import(vm_dom[domain].vmd_kernel_arena,
                          kva_import_domain, NULL, kernel_arena, KVA_QUANTUM);
      
                      /*
                       * In architectures with superpages, maintain separate arenas
                       * for allocations with permissions that differ from the
                       * "standard" read/write permissions used for kernel memory,
                       * so as not to inhibit superpage promotion.
                       */
      #if VM_NRESERVLEVEL > 0
                      vm_dom[domain].vmd_kernel_rwx_arena = vmem_create(
                          "kernel rwx arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK);
                      vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena,
                          kva_import_domain, (vmem_release_t *)vmem_xfree,
                          kernel_arena, KVA_QUANTUM);
      #endif
              }
      
              /*
               * This must be the very first call so that the virtual address
               * space used for early allocations is properly marked used in
               * the map.
               */
              uma_startup2();
      }
      
      /*
       *        kmem_bootstrap_free:
       *
       *        Free pages backing preloaded data (e.g., kernel modules) to the
       *        system.  Currently only supported on platforms that create a
       *        vm_phys segment for preloaded data.
       */
      void
      kmem_bootstrap_free(vm_offset_t start, vm_size_t size)
      {
      #if defined(__i386__) || defined(__amd64__)
              struct vm_domain *vmd;
              vm_offset_t end, va;
              vm_paddr_t pa;
              vm_page_t m;
      
              end = trunc_page(start + size);
              start = round_page(start);
      
      #ifdef __amd64__
              /*
               * Preloaded files do not have execute permissions by default on amd64.
               * Restore the default permissions to ensure that the direct map alias
               * is updated.
               */
              pmap_change_prot(start, end - start, VM_PROT_RW);
      #endif
              for (va = start; va < end; va += PAGE_SIZE) {
                      pa = pmap_kextract(va);
                      m = PHYS_TO_VM_PAGE(pa);
      
                      vmd = vm_pagequeue_domain(m);
                      vm_domain_free_lock(vmd);
                      vm_phys_free_pages(m, 0);
                      vm_domain_free_unlock(vmd);
      
                      vm_domain_freecnt_inc(vmd, 1);
                      vm_cnt.v_page_count++;
              }
              pmap_remove(kernel_pmap, start, end);
              (void)vmem_add(kernel_arena, start, end - start, M_WAITOK);
      #endif
      }
      
      /*
       * Allow userspace to directly trigger the VM drain routine for testing
       * purposes.
       */
      static int
      debug_vm_lowmem(SYSCTL_HANDLER_ARGS)
      {
              int error, i;
      
              i = 0;
              error = sysctl_handle_int(oidp, &i, 0, req);
              if (error)
                      return (error);
              if ((i & ~(VM_LOW_KMEM | VM_LOW_PAGES)) != 0)
                      return (EINVAL);
              if (i != 0)
                      EVENTHANDLER_INVOKE(vm_lowmem, i);
              return (0);
      }
      
      SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, 0,
          debug_vm_lowmem, "I", "set to trigger vm_lowmem event with given flags");
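
       /*
        * Illustrative usage (not part of the original sources): from
        * userspace the event can be triggered for testing with sysctl(8),
        * e.g. "sysctl debug.vm_lowmem=<flags>", where <flags> is a
        * combination of the VM_LOW_KMEM and VM_LOW_PAGES bits accepted by
        * the handler above.
        */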
      /*-
       * SPDX-License-Identifier: BSD-3-Clause
       *
       * Copyright (c) 1982, 1986, 1988, 1990, 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)ip_output.c        8.3 (Berkeley) 1/21/94
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_inet.h"
      #include "opt_ipsec.h"
      #include "opt_kern_tls.h"
      #include "opt_mbuf_stress_test.h"
      #include "opt_mpath.h"
      #include "opt_ratelimit.h"
      #include "opt_route.h"
      #include "opt_rss.h"
      #include "opt_sctp.h"
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/kernel.h>
      #include <sys/ktls.h>
      #include <sys/lock.h>
      #include <sys/malloc.h>
      #include <sys/mbuf.h>
      #include <sys/priv.h>
      #include <sys/proc.h>
      #include <sys/protosw.h>
      #include <sys/rmlock.h>
      #include <sys/sdt.h>
      #include <sys/socket.h>
      #include <sys/socketvar.h>
      #include <sys/sysctl.h>
      #include <sys/ucred.h>
      
      #include <net/if.h>
      #include <net/if_var.h>
      #include <net/if_llatbl.h>
      #include <net/netisr.h>
      #include <net/pfil.h>
      #include <net/route.h>
      #ifdef RADIX_MPATH
      #include <net/radix_mpath.h>
      #endif
      #include <net/rss_config.h>
      #include <net/vnet.h>
      
      #include <netinet/in.h>
      #include <netinet/in_fib.h>
      #include <netinet/in_kdtrace.h>
      #include <netinet/in_systm.h>
      #include <netinet/ip.h>
      #include <netinet/in_pcb.h>
      #include <netinet/in_rss.h>
      #include <netinet/in_var.h>
      #include <netinet/ip_var.h>
      #include <netinet/ip_options.h>
      
      #include <netinet/udp.h>
      #include <netinet/udp_var.h>
      
      #ifdef SCTP
      #include <netinet/sctp.h>
      #include <netinet/sctp_crc32.h>
      #endif
      
      #include <netipsec/ipsec_support.h>
      
      #include <machine/in_cksum.h>
      
      #include <security/mac/mac_framework.h>
      
      #ifdef MBUF_STRESS_TEST
      static int mbuf_frag_size = 0;
      SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
              &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
      #endif
      
      static void        ip_mloopback(struct ifnet *, const struct mbuf *, int);
      
      
      extern int in_mcast_loop;
      extern        struct protosw inetsw[];
      
      static inline int
      ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags,
          struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error)
      {
              struct m_tag *fwd_tag = NULL;
              struct mbuf *m;
              struct in_addr odst;
              struct ip *ip;
              int pflags = PFIL_OUT;
      
              if (flags & IP_FORWARDING)
                      pflags |= PFIL_FWD;
      
              m = *mp;
              ip = mtod(m, struct ip *);
      
              /* Run through list of hooks for output packets. */
              odst.s_addr = ip->ip_dst.s_addr;
              switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, pflags, inp)) {
              case PFIL_DROPPED:
                      *error = EACCES;
                      /* FALLTHROUGH */
              case PFIL_CONSUMED:
                      return 1; /* Finished */
              case PFIL_PASS:
                      *error = 0;
              }
              m = *mp;
              ip = mtod(m, struct ip *);
      
              /* See if destination IP address was changed by packet filter. */
              if (odst.s_addr != ip->ip_dst.s_addr) {
                      m->m_flags |= M_SKIP_FIREWALL;
                      /* If destination is now ourself drop to ip_input(). */
                      if (in_localip(ip->ip_dst)) {
                              m->m_flags |= M_FASTFWD_OURS;
                              if (m->m_pkthdr.rcvif == NULL)
                                      m->m_pkthdr.rcvif = V_loif;
                              if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
                                      m->m_pkthdr.csum_flags |=
                                              CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
                                      m->m_pkthdr.csum_data = 0xffff;
                              }
                              m->m_pkthdr.csum_flags |=
                                      CSUM_IP_CHECKED | CSUM_IP_VALID;
      #ifdef SCTP
                              if (m->m_pkthdr.csum_flags & CSUM_SCTP)
                                      m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
      #endif
                              *error = netisr_queue(NETISR_IP, m);
                              return 1; /* Finished */
                      }
      
                      bzero(dst, sizeof(*dst));
                      dst->sin_family = AF_INET;
                      dst->sin_len = sizeof(*dst);
                      dst->sin_addr = ip->ip_dst;
      
                      return -1; /* Reloop */
              }
              /* See if fib was changed by packet filter. */
              if ((*fibnum) != M_GETFIB(m)) {
                      m->m_flags |= M_SKIP_FIREWALL;
                      *fibnum = M_GETFIB(m);
                      return -1; /* Reloop for FIB change */
              }
      
              /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
              if (m->m_flags & M_FASTFWD_OURS) {
                      if (m->m_pkthdr.rcvif == NULL)
                              m->m_pkthdr.rcvif = V_loif;
                      if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
                              m->m_pkthdr.csum_flags |=
                                      CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
                              m->m_pkthdr.csum_data = 0xffff;
                      }
      #ifdef SCTP
                      if (m->m_pkthdr.csum_flags & CSUM_SCTP)
                              m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
      #endif
                      m->m_pkthdr.csum_flags |=
                              CSUM_IP_CHECKED | CSUM_IP_VALID;
      
                      *error = netisr_queue(NETISR_IP, m);
                      return 1; /* Finished */
              }
              /* Or forward to some other address? */
              if ((m->m_flags & M_IP_NEXTHOP) &&
                  ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
                      bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
                      m->m_flags |= M_SKIP_FIREWALL;
                      m->m_flags &= ~M_IP_NEXTHOP;
                      m_tag_delete(m, fwd_tag);
      
                      return -1; /* Reloop for CHANGE of dst */
              }
      
              return 0;
      }
      
      static int
      ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
          const struct sockaddr_in *gw, struct route *ro, bool stamp_tag)
      {
      #ifdef KERN_TLS
              struct ktls_session *tls = NULL;
      #endif
              struct m_snd_tag *mst;
              int error;
      
              MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
              mst = NULL;
      
      #ifdef KERN_TLS
              /*
               * If this is an unencrypted TLS record, save a reference to
               * the record.  This local reference is used to call
               * ktls_output_eagain after the mbuf has been freed (thus
               * dropping the mbuf's reference) in if_output.
               */
              if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
                      tls = ktls_hold(m->m_next->m_ext.ext_pgs->tls);
                      mst = tls->snd_tag;
      
                      /*
                       * If a TLS session doesn't have a valid tag, it must
                       * have had an earlier ifp mismatch, so drop this
                       * packet.
                       */
                      if (mst == NULL) {
                              error = EAGAIN;
                              goto done;
                      }
              }
      #endif
      #ifdef RATELIMIT
              if (inp != NULL && mst == NULL) {
                      if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
                          (inp->inp_snd_tag != NULL &&
                          inp->inp_snd_tag->ifp != ifp))
                              in_pcboutput_txrtlmt(inp, ifp, m);
      
                      if (inp->inp_snd_tag != NULL)
                              mst = inp->inp_snd_tag;
              }
      #endif
              if (stamp_tag && mst != NULL) {
                      KASSERT(m->m_pkthdr.rcvif == NULL,
                          ("trying to add a send tag to a forwarded packet"));
                      if (mst->ifp != ifp) {
                              error = EAGAIN;
                              goto done;
                      }
      
                      /* stamp send tag on mbuf */
                      m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
                      m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
              }
      
              error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro);
      
      done:
              /* Check for route change invalidating send tags. */
      #ifdef KERN_TLS
              if (tls != NULL) {
                      if (error == EAGAIN)
                              error = ktls_output_eagain(inp, tls);
                      ktls_free(tls);
              }
      #endif
      #ifdef RATELIMIT
              if (error == EAGAIN)
                      in_pcboutput_eagain(inp);
      #endif
              return (error);
      }
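
       /*
        * Illustrative sketch (not part of the original sources) of how a
        * transport protocol might hand a packet to ip_output(), which is
        * documented below.  The mbuf "m" is assumed to already contain the
        * skeletal IP header and "inp" to be the caller's PCB; no options,
        * cached route, or multicast options are passed here:
        *
        *        error = ip_output(m, NULL, NULL, 0, NULL, inp);
        */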
      
      /*
       * IP output.  The packet in mbuf chain m contains a skeletal IP
       * header (with len, off, ttl, proto, tos, src, dst).
       * The mbuf chain containing the packet will be freed.
       * The mbuf opt, if present, will not be freed.
        * If route ro is present and has ro_rt initialized, the route lookup is
        * skipped and ro->ro_rt is used.  If ro is present but ro->ro_rt is NULL,
        * then the result of the route lookup is stored in ro->ro_rt.
        *
        * In the IP forwarding case, the packet arrives with options already
        * inserted, so the caller must pass a NULL opt pointer.
       */
      int
      ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
          struct ip_moptions *imo, struct inpcb *inp)
       {
              struct rm_priotracker in_ifa_tracker;
              struct ip *ip;
              struct ifnet *ifp = NULL;        /* keep compiler happy */
              struct mbuf *m0;
              int hlen = sizeof (struct ip);
              int mtu;
              int error = 0;
              struct sockaddr_in *dst, sin;
              const struct sockaddr_in *gw;
              struct in_ifaddr *ia;
              struct in_addr src;
              int isbroadcast;
              uint16_t ip_len, ip_off;
              uint32_t fibnum;
      #if defined(IPSEC) || defined(IPSEC_SUPPORT)
              int no_route_but_check_spd = 0;
      #endif
      
              M_ASSERTPKTHDR(m);
              NET_EPOCH_ASSERT();
      
               if (inp != NULL) {
                      INP_LOCK_ASSERT(inp);
                      M_SETFIB(m, inp->inp_inc.inc_fibnum);
                      if ((flags & IP_NODEFAULTFLOWID) == 0) {
                               m->m_pkthdr.flowid = inp->inp_flowid;
                              M_HASHTYPE_SET(m, inp->inp_flowtype);
                      }
      #ifdef NUMA
                      m->m_pkthdr.numa_domain = inp->inp_numa_domain;
      #endif
              }
      
               if (opt) {
                       int len = 0;
                      m = ip_insertoptions(m, opt, &len);
                      if (len != 0)
                              hlen = len; /* ip->ip_hl is updated above */
              }
              ip = mtod(m, struct ip *);
              ip_len = ntohs(ip->ip_len);
              ip_off = ntohs(ip->ip_off);
      
              if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
                      ip->ip_v = IPVERSION;
                       ip->ip_hl = hlen >> 2;
                      ip_fillid(ip);
              } else {
                      /* Header already set, fetch hlen from there */
                      hlen = ip->ip_hl << 2;
              }
              if ((flags & IP_FORWARDING) == 0)
                      IPSTAT_INC(ips_localout);
      
              /*
               * dst/gw handling:
               *
                * gw is read-only but can point either to dst or to rt_gateway,
                * so we need to restore gw if we redo the lookup.
               */
               fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
              if (ro != NULL)
                      dst = (struct sockaddr_in *)&ro->ro_dst;
              else
                      dst = &sin;
               if (ro == NULL || ro->ro_rt == NULL) {
                      bzero(dst, sizeof(*dst));
                      dst->sin_family = AF_INET;
                      dst->sin_len = sizeof(*dst);
                      dst->sin_addr = ip->ip_dst;
              }
              gw = dst;
      again:
              /*
               * Validate route against routing table additions;
               * a better/more specific route might have been added.
               */
               if (inp != NULL && ro != NULL && ro->ro_rt != NULL)
                       RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
              /*
               * If there is a cached route,
               * check that it is to the same destination
               * and is still up.  If not, free it and try again.
               * The address family should also be checked in case of sharing the
               * cache with IPv6.
               * Also check whether routing cache needs invalidation.
               */
               if (ro != NULL && ro->ro_rt != NULL &&
                   ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
                   ro->ro_rt->rt_ifp == NULL || !RT_LINK_IS_UP(ro->ro_rt->rt_ifp) ||
                   dst->sin_family != AF_INET ||
                   dst->sin_addr.s_addr != ip->ip_dst.s_addr))
                       RO_INVALIDATE_CACHE(ro);
              ia = NULL;
              /*
               * If routing to interface only, short circuit routing lookup.
               * The use of an all-ones broadcast address implies this; an
               * interface is specified by the broadcast address of an interface,
               * or the destination address of a ptp interface.
               */
              if (flags & IP_SENDONES) {
                      if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
                                                            M_GETFIB(m)))) == NULL &&
                          (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
                                                          M_GETFIB(m)))) == NULL) {
                              IPSTAT_INC(ips_noroute);
                              error = ENETUNREACH;
                              goto bad;
                      }
                      ip->ip_dst.s_addr = INADDR_BROADCAST;
                      dst->sin_addr = ip->ip_dst;
                      ifp = ia->ia_ifp;
                      mtu = ifp->if_mtu;
                      ip->ip_ttl = 1;
                      isbroadcast = 1;
                      src = IA_SIN(ia)->sin_addr;
              } else if (flags & IP_ROUTETOIF) {
                       if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
                                                          M_GETFIB(m)))) == NULL &&
                          (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
                                                      M_GETFIB(m)))) == NULL) {
                              IPSTAT_INC(ips_noroute);
                              error = ENETUNREACH;
                              goto bad;
                      }
                      ifp = ia->ia_ifp;
                      mtu = ifp->if_mtu;
                      ip->ip_ttl = 1;
                       isbroadcast = ifp->if_flags & IFF_BROADCAST ?
                           in_ifaddr_broadcast(dst->sin_addr, ia) : 0;
                       src = IA_SIN(ia)->sin_addr;
               } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
                  imo != NULL && imo->imo_multicast_ifp != NULL) {
                      /*
                       * Bypass the normal routing lookup for multicast
                       * packets if the interface is specified.
                       */
                      ifp = imo->imo_multicast_ifp;
                      mtu = ifp->if_mtu;
                       IFP_TO_IA(ifp, ia, &in_ifa_tracker);
                      isbroadcast = 0;        /* fool gcc */
                      /* Interface may have no addresses. */
                      if (ia != NULL)
                              src = IA_SIN(ia)->sin_addr;
                      else
                              src.s_addr = INADDR_ANY;
              } else if (ro != NULL) {
                      if (ro->ro_rt == NULL) {
                              /*
                               * We want to do any cloning requested by the link
                               * layer, as this is probably required in all cases
                               * for correct operation (as it is for ARP).
                               */
      #ifdef RADIX_MPATH
                              rtalloc_mpath_fib(ro,
                                  ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
                                  fibnum);
      #else
                              in_rtalloc_ign(ro, 0, fibnum);
      #endif
                               if (ro->ro_rt == NULL ||
                                  (ro->ro_rt->rt_flags & RTF_UP) == 0 ||
                                  ro->ro_rt->rt_ifp == NULL ||
                                  !RT_LINK_IS_UP(ro->ro_rt->rt_ifp)) {
      #if defined(IPSEC) || defined(IPSEC_SUPPORT)
                                      /*
                                       * There is no route for this packet, but it is
                                       * possible that a matching SPD entry exists.
                                       */
                                      no_route_but_check_spd = 1;
                                      mtu = 0; /* Silence GCC warning. */
                                      goto sendit;
      #endif
                                      IPSTAT_INC(ips_noroute);
                                      error = EHOSTUNREACH;
                                      goto bad;
                              }
                      }
                      ia = ifatoia(ro->ro_rt->rt_ifa);
                      ifp = ro->ro_rt->rt_ifp;
                      counter_u64_add(ro->ro_rt->rt_pksent, 1);
                      rt_update_ro_flags(ro);
                       if (ro->ro_rt->rt_flags & RTF_GATEWAY)
                               gw = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
                       if (ro->ro_rt->rt_flags & RTF_HOST)
                               isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
                       else if (ifp->if_flags & IFF_BROADCAST)
                               isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia);
                       else
                               isbroadcast = 0;
                       if (ro->ro_rt->rt_flags & RTF_HOST)
                               mtu = ro->ro_rt->rt_mtu;
                       else
                               mtu = ifp->if_mtu;
                      src = IA_SIN(ia)->sin_addr;
              } else {
                      struct nhop4_extended nh;
      
                      bzero(&nh, sizeof(nh));
                      if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh) !=
                          0) {
      #if defined(IPSEC) || defined(IPSEC_SUPPORT)
                              /*
                               * There is no route for this packet, but it is
                               * possible that a matching SPD entry exists.
                               */
                              no_route_but_check_spd = 1;
                              mtu = 0; /* Silence GCC warning. */
                              goto sendit;
      #endif
                              IPSTAT_INC(ips_noroute);
                              error = EHOSTUNREACH;
                              goto bad;
                      }
                      ifp = nh.nh_ifp;
                      mtu = nh.nh_mtu;
                       /*
                        * Here we actually rewrite dst to be the gateway address,
                        * contradicting the comment at the beginning of the function.
                        * However, in this case we are always dealing with an
                        * on-stack dst.  If pfil(9) sends us back to the beginning
                        * of the function, dst will be rewritten by ip_output_pfil().
                        */
                      MPASS(dst == &sin);
                      dst->sin_addr = nh.nh_addr;
                      ia = nh.nh_ia;
                      src = nh.nh_src;
                      isbroadcast = (((nh.nh_flags & (NHF_HOST | NHF_BROADCAST)) ==
                          (NHF_HOST | NHF_BROADCAST)) ||
                           ((ifp->if_flags & IFF_BROADCAST) &&
                           in_ifaddr_broadcast(dst->sin_addr, ia)));
              }
      
              /* Catch a possible divide by zero later. */
              KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (rt_flags=0x%08x) ifp=%p",
                  __func__, mtu, ro,
                  (ro != NULL && ro->ro_rt != NULL) ? ro->ro_rt->rt_flags : 0, ifp));
      
              if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
                      m->m_flags |= M_MCAST;
                      /*
                       * IP destination address is multicast.  Make sure "gw"
                       * still points to the address in "ro".  (It may have been
                       * changed to point to a gateway address, above.)
                       */
                      gw = dst;
                      /*
                       * See if the caller provided any multicast options
                       */
                      if (imo != NULL) {
                              ip->ip_ttl = imo->imo_multicast_ttl;
                               if (imo->imo_multicast_vif != -1)
                                      ip->ip_src.s_addr =
                                          ip_mcast_src ?
                                          ip_mcast_src(imo->imo_multicast_vif) :
                                          INADDR_ANY;
                      } else
                               ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
                      /*
                       * Confirm that the outgoing interface supports multicast.
                       */
                      if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
                               if ((ifp->if_flags & IFF_MULTICAST) == 0) {
                                      IPSTAT_INC(ips_noroute);
                                      error = ENETUNREACH;
                                      goto bad;
                              }
                      }
                      /*
                       * If source address not specified yet, use address
                       * of outgoing interface.
                       */
                       if (ip->ip_src.s_addr == INADDR_ANY)
                               ip->ip_src = src;
      
                       if ((imo == NULL && in_mcast_loop) ||
                          (imo && imo->imo_multicast_loop)) {
                              /*
                               * Loop back multicast datagram if not expressly
                               * forbidden to do so, even if we are not a member
                               * of the group; ip_input() will filter it later,
                               * thus deferring a hash lookup and mutex acquisition
                               * at the expense of a cheap copy using m_copym().
                               */
                              ip_mloopback(ifp, m, hlen);
                      } else {
                              /*
                               * If we are acting as a multicast router, perform
                               * multicast forwarding as if the packet had just
                               * arrived on the interface to which we are about
                               * to send.  The multicast forwarding function
                               * recursively calls this function, using the
                               * IP_FORWARDING flag to prevent infinite recursion.
                               *
                               * Multicasts that are looped back by ip_mloopback(),
                               * above, will be forwarded by the ip_input() routine,
                               * if necessary.
                               */
                              if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
                                      /*
                                       * If rsvp daemon is not running, do not
                                       * set ip_moptions. This ensures that the packet
                                       * is multicast and not just sent down one link
                                       * as prescribed by rsvpd.
                                       */
                                      if (!V_rsvp_on)
                                              imo = NULL;
                                      if (ip_mforward &&
                                          ip_mforward(ip, ifp, m, imo) != 0) {
                                              m_freem(m);
                                              goto done;
                                      }
                              }
                      }
      
                      /*
                       * Multicasts with a time-to-live of zero may be looped-
                       * back, above, but must not be transmitted on a network.
                       * Also, multicasts addressed to the loopback interface
                       * are not sent -- the above call to ip_mloopback() will
                       * loop back a copy. ip_input() will drop the copy if
                       * this host does not belong to the destination group on
                       * the loopback interface.
                       */
                       if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
                              m_freem(m);
                              goto done;
                      }
      
                      goto sendit;
              }
      
              /*
               * If the source address is not specified yet, use the address
                * of the outgoing interface.
               */
               if (ip->ip_src.s_addr == INADDR_ANY)
                       ip->ip_src = src;
      
              /*
               * Look for broadcast address and
               * verify user is allowed to send
               * such a packet.
               */
              if (isbroadcast) {
                       if ((ifp->if_flags & IFF_BROADCAST) == 0) {
                              error = EADDRNOTAVAIL;
                              goto bad;
                      }
                      if ((flags & IP_ALLOWBROADCAST) == 0) {
                              error = EACCES;
                              goto bad;
                      }
                      /* don't allow broadcast messages to be fragmented */
                      if (ip_len > mtu) {
                              error = EMSGSIZE;
                              goto bad;
                      }
                       m->m_flags |= M_BCAST;
               } else {
                       m->m_flags &= ~M_BCAST;
              }
      
      sendit:
      #if defined(IPSEC) || defined(IPSEC_SUPPORT)
               if (IPSEC_ENABLED(ipv4)) {
                      if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) {
                              if (error == EINPROGRESS)
                                      error = 0;
                              goto done;
                      }
              }
              /*
               * Check if there was a route for this packet; return error if not.
               */
              if (no_route_but_check_spd) {
                      IPSTAT_INC(ips_noroute);
                      error = EHOSTUNREACH;
                      goto bad;
              }
              /* Update variables that are affected by ipsec4_output(). */
              ip = mtod(m, struct ip *);
              hlen = ip->ip_hl << 2;
      #endif /* IPSEC */
      
              /* Jump over all PFIL processing if hooks are not active. */
              if (PFIL_HOOKED_OUT(V_inet_pfil_head)) {
                      switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum,
                          &error)) {
                      case 1: /* Finished */
                              goto done;
      
                      case 0: /* Continue normally */
                              ip = mtod(m, struct ip *);
                              break;
      
                      case -1: /* Need to try again */
                              /* Reset everything for a new round */
                              if (ro != NULL) {
                                      RO_RTFREE(ro);
                                      ro->ro_prepend = NULL;
                              }
                              gw = dst;
                              ip = mtod(m, struct ip *);
                              goto again;
      
                      }
              }
      
              /* IN_LOOPBACK must not appear on the wire - RFC1122. */
               if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) ||
                   IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) {
                       if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
                              IPSTAT_INC(ips_badaddr);
                              error = EADDRNOTAVAIL;
                              goto bad;
                      }
              }
      
              m->m_pkthdr.csum_flags |= CSUM_IP;
              if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
                      m = mb_unmapped_to_ext(m);
                      if (m == NULL) {
                              IPSTAT_INC(ips_odropped);
                              error = ENOBUFS;
                              goto bad;
                      }
                       in_delayed_cksum(m);
                      m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
              } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
                      m = mb_unmapped_to_ext(m);
                      if (m == NULL) {
                              IPSTAT_INC(ips_odropped);
                              error = ENOBUFS;
                              goto bad;
                      }
              }
      #ifdef SCTP
               if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
                      m = mb_unmapped_to_ext(m);
                      if (m == NULL) {
                              IPSTAT_INC(ips_odropped);
                              error = ENOBUFS;
                              goto bad;
                      }
                       sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
                      m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
              }
      #endif
      
              /*
               * If small enough for interface, or the interface will take
               * care of the fragmentation for us, we can just send directly.
               */
               if (ip_len <= mtu ||
                  (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
                      ip->ip_sum = 0;
                       if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
                               ip->ip_sum = in_cksum(m, hlen);
                              m->m_pkthdr.csum_flags &= ~CSUM_IP;
                      }
      
                      /*
                       * Record statistics for this interface address.
                       * With CSUM_TSO the byte/packet count will be slightly
                       * incorrect because we count the IP+TCP headers only
                       * once instead of for every generated packet.
                       */
                      if (!(flags & IP_FORWARDING) && ia) {
                              if (m->m_pkthdr.csum_flags & CSUM_TSO)
                                      counter_u64_add(ia->ia_ifa.ifa_opackets,
                                           m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
                              else
                                      counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
      
                               counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
                      }
      #ifdef MBUF_STRESS_TEST
                      if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
                              m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
      #endif
                      /*
                       * Reset layer specific mbuf flags
                       * to avoid confusing lower layers.
                       */
                      m_clrprotoflags(m);
                       IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
                      error = ip_output_send(inp, ifp, m, gw, ro,
                          (flags & IP_NO_SND_TAG_RL) ? false : true);
                      goto done;
              }
      
              /* Balk when DF bit is set or the interface didn't support TSO. */
              if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
                      error = EMSGSIZE;
                      IPSTAT_INC(ips_cantfrag);
                      goto bad;
              }
      
              /*
               * Too large for interface; fragment if possible. If successful,
               * on return, m will point to a list of packets to be sent.
               */
              error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
              if (error)
                      goto bad;
               for (; m; m = m0) {
                      m0 = m->m_nextpkt;
                      m->m_nextpkt = 0;
                      if (error == 0) {
                              /* Record statistics for this interface address. */
                              if (ia != NULL) {
                                      counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
                                      counter_u64_add(ia->ia_ifa.ifa_obytes,
                                           m->m_pkthdr.len);
                              }
                              /*
                               * Reset layer specific mbuf flags
                               * to avoid confusing upper layers.
                               */
                               m_clrprotoflags(m);

                               IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
                                  mtod(m, struct ip *), NULL);
                              error = ip_output_send(inp, ifp, m, gw, ro, true);
                      } else
                              m_freem(m);
              }
      
               if (error == 0)
                      IPSTAT_INC(ips_fragmented);
      
      done:
              return (error);
       bad:
              m_freem(m);
              goto done;
      }
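
       /*
        * Illustrative caller sketch (added commentary, not part of the original
        * source): a transport protocol typically hands a finished datagram to
        * ip_output() roughly as follows, with "ipflags" built from flags such
        * as IP_ALLOWBROADCAST as appropriate:
        *
        *	error = ip_output(m, inp->inp_options, NULL, ipflags,
        *	    inp->inp_moptions, inp);
        *
        * ip_output() then selects the route and source address, runs the IPSEC
        * and pfil(9) hooks, and either transmits the packet directly or splits
        * it with ip_fragment() below.
        */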
      
      /*
       * Create a chain of fragments which fit the given mtu. m_frag points to the
       * mbuf to be fragmented; on return it points to the chain with the fragments.
       * Return 0 if no error. If error, m_frag may contain a partially built
       * chain of fragments that should be freed by the caller.
       *
        * if_hwassist_flags holds the hw offload capabilities (see if_data.ifi_hwassist)
       */
      int
      ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
          u_long if_hwassist_flags)
       {
              int error = 0;
              int hlen = ip->ip_hl << 2;
              int len = (mtu - hlen) & ~7;        /* size of payload in each fragment */
              int off;
              struct mbuf *m0 = *m_frag;        /* the original packet                */
              int firstlen;
              struct mbuf **mnext;
              int nfrags;
              uint16_t ip_len, ip_off;
      
              ip_len = ntohs(ip->ip_len);
              ip_off = ntohs(ip->ip_off);
      
              if (ip_off & IP_DF) {        /* Fragmentation not allowed */
                      IPSTAT_INC(ips_cantfrag);
                      return EMSGSIZE;
              }
      
              /*
               * Must be able to put at least 8 bytes per fragment.
               */
              if (len < 8)
                      return EMSGSIZE;
      
              /*
               * If the interface will not calculate checksums on
               * fragmented packets, then do it here.
               */
               if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
                      m0 = mb_unmapped_to_ext(m0);
                      if (m0 == NULL) {
                              error = ENOBUFS;
                              IPSTAT_INC(ips_odropped);
                              goto done;
                      }
                       in_delayed_cksum(m0);
                      m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
              }
      #ifdef SCTP
               if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
                      m0 = mb_unmapped_to_ext(m0);
                      if (m0 == NULL) {
                              error = ENOBUFS;
                              IPSTAT_INC(ips_odropped);
                              goto done;
                      }
                      sctp_delayed_cksum(m0, hlen);
                      m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
              }
      #endif
               if (len > PAGE_SIZE) {
                      /*
                       * Fragment large datagrams such that each segment
                       * contains a multiple of PAGE_SIZE amount of data,
                       * plus headers. This enables a receiver to perform
                       * page-flipping zero-copy optimizations.
                       *
                       * XXX When does this help given that sender and receiver
                       * could have different page sizes, and also mtu could
                       * be less than the receiver's page size ?
                       */
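                       /*
                        * Illustrative arithmetic (added commentary, not from the
                        * original source), assuming PAGE_SIZE = 4096, hlen = 20 and
                        * a 9000-byte mtu: len starts as (9000 - 20) & ~7 = 8976,
                        * which exceeds PAGE_SIZE, so off becomes
                        * ((9000 - 20) & ~7) + 20 = 8996 and newlen = 9000 & ~4095 =
                        * 8192.  Since 8192 + 20 <= 9000 the fall-back is not taken,
                        * and every fragment after the first carries a page-multiple
                        * 8192-byte payload.
                        */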
                      int newlen;
      
                      off = MIN(mtu, m0->m_pkthdr.len);
      
                      /*
                       * firstlen (off - hlen) must be aligned on an
                       * 8-byte boundary
                       */
                      if (off < hlen)
                              goto smart_frag_failure;
                      off = ((off - hlen) & ~7) + hlen;
                      newlen = (~PAGE_MASK) & mtu;
                      if ((newlen + sizeof (struct ip)) > mtu) {
                               /* we failed, go back to the default */
      smart_frag_failure:
                              newlen = len;
                              off = hlen + len;
                      }
                      len = newlen;
      
              } else {
                      off = hlen + len;
              }
      
              firstlen = off - hlen;
              mnext = &m0->m_nextpkt;                /* pointer to next packet */
      
              /*
               * Loop through length of segment after first fragment,
               * make new header and copy data of each part and link onto chain.
               * Here, m0 is the original packet, m is the fragment being created.
               * The fragments are linked off the m_nextpkt of the original
               * packet, which after processing serves as the first fragment.
               */
               for (nfrags = 1; off < ip_len; off += len, nfrags++) {
                      struct ip *mhip;        /* ip header on the fragment */
                      struct mbuf *m;
                      int mhlen = sizeof (struct ip);
      
                      m = m_gethdr(M_NOWAIT, MT_DATA);
                      if (m == NULL) {
                              error = ENOBUFS;
                              IPSTAT_INC(ips_odropped);
                              goto done;
                      }
                      /*
                       * Make sure the complete packet header gets copied
                       * from the originating mbuf to the newly created
                       * mbuf. This also ensures that existing firewall
                       * classification(s), VLAN tags and so on get copied
                       * to the resulting fragmented packet(s):
                       */
                      if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
                              m_free(m);
                              error = ENOBUFS;
                              IPSTAT_INC(ips_odropped);
                              goto done;
                      }
                      /*
                       * In the first mbuf, leave room for the link header, then
                       * copy the original IP header including options. The payload
                       * goes into an additional mbuf chain returned by m_copym().
                       */
                      m->m_data += max_linkhdr;
                       mhip = mtod(m, struct ip *);
                       *mhip = *ip;
                       if (hlen > sizeof (struct ip)) {
                              mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
                              mhip->ip_v = IPVERSION;
                              mhip->ip_hl = mhlen >> 2;
                      }
                      m->m_len = mhlen;
                      /* XXX do we need to add ip_off below ? */
                      mhip->ip_off = ((off - hlen) >> 3) + ip_off;
                      if (off + len >= ip_len)
                               len = ip_len - off;
                       else
                               mhip->ip_off |= IP_MF;
                      mhip->ip_len = htons((u_short)(len + mhlen));
                      m->m_next = m_copym(m0, off, len, M_NOWAIT);
                      if (m->m_next == NULL) {        /* copy failed */
                              m_free(m);
                              error = ENOBUFS;        /* ??? */
                              IPSTAT_INC(ips_odropped);
                              goto done;
                      }
                      m->m_pkthdr.len = mhlen + len;
      #ifdef MAC
                      mac_netinet_fragment(m0, m);
      #endif
                      mhip->ip_off = htons(mhip->ip_off);
                      mhip->ip_sum = 0;
                      if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
                               mhip->ip_sum = in_cksum(m, mhlen);
                              m->m_pkthdr.csum_flags &= ~CSUM_IP;
                      }
                      *mnext = m;
                      mnext = &m->m_nextpkt;
              }
              IPSTAT_ADD(ips_ofragments, nfrags);
      
              /*
               * Update first fragment by trimming what's been copied out
               * and updating header.
               */
              m_adj(m0, hlen + firstlen - ip_len);
              m0->m_pkthdr.len = hlen + firstlen;
              ip->ip_len = htons((u_short)m0->m_pkthdr.len);
              ip->ip_off = htons(ip_off | IP_MF);
              ip->ip_sum = 0;
              if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
                       ip->ip_sum = in_cksum(m0, hlen);
                      m0->m_pkthdr.csum_flags &= ~CSUM_IP;
              }
      
      done:
              *m_frag = m0;
              return error;
      }
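
       /*
        * Illustrative caller sketch (added commentary, not part of the original
        * source), modeled on the fragmentation path of ip_output() above.
        * "my_transmit" is a hypothetical stand-in for the caller's send routine:
        *
        *	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
        *	if (error != 0)
        *		goto bad;
        *	for (; m != NULL; m = m0) {
        *		m0 = m->m_nextpkt;
        *		m->m_nextpkt = NULL;
        *		if (error == 0)
        *			error = my_transmit(ifp, m);
        *		else
        *			m_freem(m);
        *	}
        *	if (error == 0)
        *		IPSTAT_INC(ips_fragmented);
        */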
      
      void
      in_delayed_cksum(struct mbuf *m)
       {
              struct ip *ip;
              struct udphdr *uh;
              uint16_t cklen, csum, offset;
      
              ip = mtod(m, struct ip *);
               offset = ip->ip_hl << 2;
      
              if (m->m_pkthdr.csum_flags & CSUM_UDP) {
                       /* If the UDP header is not in the first mbuf, copy out the UDP length. */
                      if (offset + sizeof(struct udphdr) > m->m_len) {
                              m_copydata(m, offset + offsetof(struct udphdr,
                                  uh_ulen), sizeof(cklen), (caddr_t)&cklen);
                              cklen = ntohs(cklen);
                      } else {
                              uh = (struct udphdr *)mtodo(m, offset);
                              cklen = ntohs(uh->uh_ulen);
                      }
                      csum = in_cksum_skip(m, cklen + offset, offset);
                      if (csum == 0)
                              csum = 0xffff;
              } else {
                      cklen = ntohs(ip->ip_len);
                      csum = in_cksum_skip(m, cklen, offset);
              }
              offset += m->m_pkthdr.csum_data;        /* checksum offset */
      
              if (offset + sizeof(csum) > m->m_len)
                      m_copyback(m, offset, sizeof(csum), (caddr_t)&csum);
              else
                      *(u_short *)mtodo(m, offset) = csum;
      }
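
       /*
        * Worked example (added commentary, not from the original source): for a
        * UDP datagram with a 20-byte IP header, "offset" starts at 20 and the
        * protocol layer has set csum_data = offsetof(struct udphdr, uh_sum) = 6,
        * with the pseudo-header sum already seeded into that field.  The
        * in_cksum_skip() call then checksums uh_ulen bytes starting at the UDP
        * header, and the finished checksum is stored at absolute offset 26.
        */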
      
      /*
       * IP socket option processing.
       */
      int
      ip_ctloutput(struct socket *so, struct sockopt *sopt)
       {
              struct inpcb *inp = sotoinpcb(so);
              int        error, optval;
      #ifdef        RSS
              uint32_t rss_bucket;
              int retval;
      #endif
      
              error = optval = 0;
               if (sopt->sopt_level != IPPROTO_IP) {
                      error = EINVAL;
      
                      if (sopt->sopt_level == SOL_SOCKET &&
                          sopt->sopt_dir == SOPT_SET) {
                               switch (sopt->sopt_name) {
                              case SO_REUSEADDR:
                                      INP_WLOCK(inp);
                                      if ((so->so_options & SO_REUSEADDR) != 0)
                                              inp->inp_flags2 |= INP_REUSEADDR;
                                      else
                                              inp->inp_flags2 &= ~INP_REUSEADDR;
                                      INP_WUNLOCK(inp);
                                      error = 0;
                                      break;
                              case SO_REUSEPORT:
                                       INP_WLOCK(inp);
                                      if ((so->so_options & SO_REUSEPORT) != 0)
                                              inp->inp_flags2 |= INP_REUSEPORT;
                                      else
                                              inp->inp_flags2 &= ~INP_REUSEPORT;
                                      INP_WUNLOCK(inp);
                                      error = 0;
                                      break;
                              case SO_REUSEPORT_LB:
                                       INP_WLOCK(inp);
                                      if ((so->so_options & SO_REUSEPORT_LB) != 0)
                                              inp->inp_flags2 |= INP_REUSEPORT_LB;
                                      else
                                              inp->inp_flags2 &= ~INP_REUSEPORT_LB;
                                      INP_WUNLOCK(inp);
                                      error = 0;
                                      break;
                              case SO_SETFIB:
                                      INP_WLOCK(inp);
                                      inp->inp_inc.inc_fibnum = so->so_fibnum;
                                      INP_WUNLOCK(inp);
                                      error = 0;
                                      break;
                              case SO_MAX_PACING_RATE:
      #ifdef RATELIMIT
                                      INP_WLOCK(inp);
                                      inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
                                      INP_WUNLOCK(inp);
                                      error = 0;
      #else
                                      error = EOPNOTSUPP;
      #endif
                                      break;
                              default:
                                      break;
                              }
                      }
                      return (error);
              }
      
              switch (sopt->sopt_dir) {
              case SOPT_SET:
                       switch (sopt->sopt_name) {
                      case IP_OPTIONS:
      #ifdef notyet
                      case IP_RETOPTS:
      #endif
                      {
                              struct mbuf *m;
                              if (sopt->sopt_valsize > MLEN) {
                                      error = EMSGSIZE;
                                      break;
                              }
                              m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
                              if (m == NULL) {
                                      error = ENOBUFS;
                                      break;
                              }
                              m->m_len = sopt->sopt_valsize;
                              error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
                                                  m->m_len);
                              if (error) {
                                      m_free(m);
                                      break;
                              }
                               INP_WLOCK(inp);
                              error = ip_pcbopts(inp, sopt->sopt_name, m);
                              INP_WUNLOCK(inp);
                              return (error);
                      }
      
                      case IP_BINDANY:
                              if (sopt->sopt_td != NULL) {
                                      error = priv_check(sopt->sopt_td,
                                          PRIV_NETINET_BINDANY);
                                       if (error)
                                              break;
                              }
                              /* FALLTHROUGH */
                      case IP_BINDMULTI:
      #ifdef        RSS
                      case IP_RSS_LISTEN_BUCKET:
      #endif
                      case IP_TOS:
                      case IP_TTL:
                      case IP_MINTTL:
                      case IP_RECVOPTS:
                      case IP_RECVRETOPTS:
                      case IP_ORIGDSTADDR:
                      case IP_RECVDSTADDR:
                      case IP_RECVTTL:
                      case IP_RECVIF:
                      case IP_ONESBCAST:
                      case IP_DONTFRAG:
                      case IP_RECVTOS:
                      case IP_RECVFLOWID:
      #ifdef        RSS
                      case IP_RECVRSSBUCKETID:
      #endif
                              error = sooptcopyin(sopt, &optval, sizeof optval,
                                                  sizeof optval);
                              if (error)
                                      break;
      
                              switch (sopt->sopt_name) {
                              case IP_TOS:
                                       inp->inp_ip_tos = optval;
                                      break;
      
                              case IP_TTL:
                                       inp->inp_ip_ttl = optval;
                                      break;
      
                              case IP_MINTTL:
                                      if (optval >= 0 && optval <= MAXTTL)
                                               inp->inp_ip_minttl = optval;
                                      else
                                              error = EINVAL;
                                      break;
      
      #define        OPTSET(bit) do {                                                \
              INP_WLOCK(inp);                                                        \
              if (optval)                                                        \
                      inp->inp_flags |= bit;                                        \
              else                                                                \
                      inp->inp_flags &= ~bit;                                        \
              INP_WUNLOCK(inp);                                                \
      } while (0)
      
      #define        OPTSET2(bit, val) do {                                                \
              INP_WLOCK(inp);                                                        \
              if (val)                                                        \
                      inp->inp_flags2 |= bit;                                        \
              else                                                                \
                      inp->inp_flags2 &= ~bit;                                \
              INP_WUNLOCK(inp);                                                \
      } while (0)
      
                              case IP_RECVOPTS:
                                       OPTSET(INP_RECVOPTS);
                                      break;
      
                              case IP_RECVRETOPTS:
                                       OPTSET(INP_RECVRETOPTS);
                                      break;
      
                              case IP_RECVDSTADDR:
                                       OPTSET(INP_RECVDSTADDR);
                                      break;
      
                              case IP_ORIGDSTADDR:
                                      OPTSET2(INP_ORIGDSTADDR, optval);
                                      break;
      
                              case IP_RECVTTL:
                                       OPTSET(INP_RECVTTL);
                                      break;
      
                              case IP_RECVIF:
                                       OPTSET(INP_RECVIF);
                                      break;
      
                              case IP_ONESBCAST:
                                      OPTSET(INP_ONESBCAST);
                                      break;
                              case IP_DONTFRAG:
                                       OPTSET(INP_DONTFRAG);
                                      break;
                              case IP_BINDANY:
                                       OPTSET(INP_BINDANY);
                                      break;
                              case IP_RECVTOS:
                                       OPTSET(INP_RECVTOS);
                                      break;
                              case IP_BINDMULTI:
                                       OPTSET2(INP_BINDMULTI, optval);
                                      break;
                              case IP_RECVFLOWID:
                                       OPTSET2(INP_RECVFLOWID, optval);
                                      break;
      #ifdef        RSS
                              case IP_RSS_LISTEN_BUCKET:
                                      if ((optval >= 0) &&
                                          (optval < rss_getnumbuckets())) {
                                              inp->inp_rss_listen_bucket = optval;
                                              OPTSET2(INP_RSS_BUCKET_SET, 1);
                                      } else {
                                              error = EINVAL;
                                      }
                                      break;
                              case IP_RECVRSSBUCKETID:
                                      OPTSET2(INP_RECVRSSBUCKETID, optval);
                                      break;
      #endif
                              }
                              break;
      #undef OPTSET
      #undef OPTSET2
      
                      /*
                       * Multicast socket options are processed by the in_mcast
                       * module.
                       */
                      case IP_MULTICAST_IF:
                      case IP_MULTICAST_VIF:
                      case IP_MULTICAST_TTL:
                      case IP_MULTICAST_LOOP:
                      case IP_ADD_MEMBERSHIP:
                      case IP_DROP_MEMBERSHIP:
                      case IP_ADD_SOURCE_MEMBERSHIP:
                      case IP_DROP_SOURCE_MEMBERSHIP:
                      case IP_BLOCK_SOURCE:
                      case IP_UNBLOCK_SOURCE:
                      case IP_MSFILTER:
                      case MCAST_JOIN_GROUP:
                      case MCAST_LEAVE_GROUP:
                      case MCAST_JOIN_SOURCE_GROUP:
                      case MCAST_LEAVE_SOURCE_GROUP:
                      case MCAST_BLOCK_SOURCE:
                      case MCAST_UNBLOCK_SOURCE:
                              error = inp_setmoptions(inp, sopt);
                              break;
      
                      case IP_PORTRANGE:
                              error = sooptcopyin(sopt, &optval, sizeof optval,
                                                  sizeof optval);
                               if (error)
                                      break;
      
                              INP_WLOCK(inp);
                               switch (optval) {
                              case IP_PORTRANGE_DEFAULT:
                                      inp->inp_flags &= ~(INP_LOWPORT);
                                      inp->inp_flags &= ~(INP_HIGHPORT);
                                      break;
      
                              case IP_PORTRANGE_HIGH:
                                       inp->inp_flags &= ~(INP_LOWPORT);
                                      inp->inp_flags |= INP_HIGHPORT;
                                      break;
      
                              case IP_PORTRANGE_LOW:
                                       inp->inp_flags &= ~(INP_HIGHPORT);
                                      inp->inp_flags |= INP_LOWPORT;
                                      break;
      
                              default:
                                      error = EINVAL;
                                      break;
                              }
                              INP_WUNLOCK(inp);
                              break;
      
      #if defined(IPSEC) || defined(IPSEC_SUPPORT)
                      case IP_IPSEC_POLICY:
                               if (IPSEC_ENABLED(ipv4)) {
                                      error = IPSEC_PCBCTL(ipv4, inp, sopt);
                                      break;
                              }
                              /* FALLTHROUGH */
      #endif /* IPSEC */
      
                      default:
                              error = ENOPROTOOPT;
                              break;
                      }
                      break;
      
              case SOPT_GET:
                       switch (sopt->sopt_name) {
                      case IP_OPTIONS:
                      case IP_RETOPTS:
                              INP_RLOCK(inp);
                              if (inp->inp_options) {
                                      struct mbuf *options;
      
                                      options = m_copym(inp->inp_options, 0,
                                          M_COPYALL, M_NOWAIT);
                                      INP_RUNLOCK(inp);
                                      if (options != NULL) {
                                               error = sooptcopyout(sopt,
                                                   mtod(options, char *),
                                                   options->m_len);
                                              m_freem(options);
                                      } else
                                              error = ENOMEM;
                              } else {
                                      INP_RUNLOCK(inp);
                                      sopt->sopt_valsize = 0;
                              }
                              break;
      
                      case IP_TOS:
                      case IP_TTL:
                      case IP_MINTTL:
                      case IP_RECVOPTS:
                      case IP_RECVRETOPTS:
                      case IP_ORIGDSTADDR:
                      case IP_RECVDSTADDR:
                      case IP_RECVTTL:
                      case IP_RECVIF:
                      case IP_PORTRANGE:
                      case IP_ONESBCAST:
                      case IP_DONTFRAG:
                      case IP_BINDANY:
                      case IP_RECVTOS:
                      case IP_BINDMULTI:
                      case IP_FLOWID:
                      case IP_FLOWTYPE:
                      case IP_RECVFLOWID:
      #ifdef        RSS
                      case IP_RSSBUCKETID:
                      case IP_RECVRSSBUCKETID:
      #endif
                              switch (sopt->sopt_name) {
      
                              case IP_TOS:
                                       optval = inp->inp_ip_tos;
                                      break;
      
                              case IP_TTL:
                                      optval = inp->inp_ip_ttl;
                                      break;
      
                              case IP_MINTTL:
                                       optval = inp->inp_ip_minttl;
                                      break;
      
      #define        OPTBIT(bit)        (inp->inp_flags & bit ? 1 : 0)
      #define        OPTBIT2(bit)        (inp->inp_flags2 & bit ? 1 : 0)
      
                              case IP_RECVOPTS:
                                      optval = OPTBIT(INP_RECVOPTS);
                                      break;
      
                              case IP_RECVRETOPTS:
                                       optval = OPTBIT(INP_RECVRETOPTS);
                                      break;
      
                              case IP_RECVDSTADDR:
                                      optval = OPTBIT(INP_RECVDSTADDR);
                                      break;
      
                              case IP_ORIGDSTADDR:
                                       optval = OPTBIT2(INP_ORIGDSTADDR);
                                      break;
      
                              case IP_RECVTTL:
                                       optval = OPTBIT(INP_RECVTTL);
                                      break;
      
                              case IP_RECVIF:
                                      optval = OPTBIT(INP_RECVIF);
                                      break;
      
                              case IP_PORTRANGE:
                                      if (inp->inp_flags & INP_HIGHPORT)
                                              optval = IP_PORTRANGE_HIGH;
                                      else if (inp->inp_flags & INP_LOWPORT)
                                              optval = IP_PORTRANGE_LOW;
                                      else
                                              optval = 0;
                                      break;
      
                              case IP_ONESBCAST:
                                      optval = OPTBIT(INP_ONESBCAST);
                                      break;
                              case IP_DONTFRAG:
                                       optval = OPTBIT(INP_DONTFRAG);
                                      break;
                              case IP_BINDANY:
                                       optval = OPTBIT(INP_BINDANY);
                                      break;
                              case IP_RECVTOS:
                                      optval = OPTBIT(INP_RECVTOS);
                                      break;
                              case IP_FLOWID:
                                      optval = inp->inp_flowid;
                                      break;
                              case IP_FLOWTYPE:
                                      optval = inp->inp_flowtype;
                                      break;
                              case IP_RECVFLOWID:
                                      optval = OPTBIT2(INP_RECVFLOWID);
                                      break;
      #ifdef        RSS
                              case IP_RSSBUCKETID:
                                      retval = rss_hash2bucket(inp->inp_flowid,
                                          inp->inp_flowtype,
                                          &rss_bucket);
                                      if (retval == 0)
                                              optval = rss_bucket;
                                      else
                                              error = EINVAL;
                                      break;
                              case IP_RECVRSSBUCKETID:
                                      optval = OPTBIT2(INP_RECVRSSBUCKETID);
                                      break;
      #endif
                              case IP_BINDMULTI:
                                      optval = OPTBIT2(INP_BINDMULTI);
                                      break;
                              }
                              error = sooptcopyout(sopt, &optval, sizeof optval);
                              break;
      
                      /*
                       * Multicast socket options are processed by the in_mcast
                       * module.
                       */
                      case IP_MULTICAST_IF:
                      case IP_MULTICAST_VIF:
                      case IP_MULTICAST_TTL:
                      case IP_MULTICAST_LOOP:
                      case IP_MSFILTER:
                              error = inp_getmoptions(inp, sopt);
                              break;
      
      #if defined(IPSEC) || defined(IPSEC_SUPPORT)
                      case IP_IPSEC_POLICY:
                              if (IPSEC_ENABLED(ipv4)) {
                                      error = IPSEC_PCBCTL(ipv4, inp, sopt);
                                      break;
                              }
                              /* FALLTHROUGH */
      #endif /* IPSEC */
      
                      default:
                              error = ENOPROTOOPT;
                              break;
                      }
                      break;
              }
              return (error);
      }
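
       /*
        * Illustrative user-space sketch (added commentary, not part of the
        * original source) of the setsockopt()/getsockopt() calls that reach
        * ip_ctloutput() above:
        *
        *	int s = socket(AF_INET, SOCK_DGRAM, 0);
        *	int tos = IPTOS_LOWDELAY, on = 1;
        *	socklen_t len = sizeof(tos);
        *
        *	setsockopt(s, IPPROTO_IP, IP_TOS, &tos, sizeof(tos));
        *	setsockopt(s, IPPROTO_IP, IP_RECVTTL, &on, sizeof(on));
        *	getsockopt(s, IPPROTO_IP, IP_TOS, &tos, &len);
        *
        * The SOL_SOCKET cases handled at the top of the function only mirror
        * socket options already set by the socket layer into inp_flags2.
        */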
      
      /*
       * Routine called from ip_output() to loop back a copy of an IP multicast
       * packet to the input queue of a specified interface.  Note that this
       * calls the output routine of the loopback "driver", but with an interface
       * pointer that might NOT be a loopback interface -- evil, but easier than
       * replicating that code here.
       */
      static void
      ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
      {
              struct ip *ip;
              struct mbuf *copym;
      
              /*
               * Make a deep copy of the packet because we're going to
                * modify the packet in order to generate checksums.
               */
              copym = m_dup(m, M_NOWAIT);
               if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen))
                      copym = m_pullup(copym, hlen);
              if (copym != NULL) {
                      /* If needed, compute the checksum and mark it as valid. */
                       if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
                               in_delayed_cksum(copym);
                              copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
                              copym->m_pkthdr.csum_flags |=
                                  CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
                              copym->m_pkthdr.csum_data = 0xffff;
                      }
                      /*
                       * We don't bother to fragment if the IP length is greater
                       * than the interface's MTU.  Can this possibly matter?
                       */
                      ip = mtod(copym, struct ip *);
                      ip->ip_sum = 0;
                      ip->ip_sum = in_cksum(copym, hlen);
                      if_simloop(ifp, copym, AF_INET, 0);
              }
      }
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 1999 Poul-Henning Kamp.
       * Copyright (c) 2008 Bjoern A. Zeeb.
       * Copyright (c) 2009 James Gritton.
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_ddb.h"
      #include "opt_inet.h"
      #include "opt_inet6.h"
      
      #include <sys/param.h>
      #include <sys/types.h>
      #include <sys/kernel.h>
      #include <sys/systm.h>
      #include <sys/errno.h>
      #include <sys/sysproto.h>
      #include <sys/malloc.h>
      #include <sys/osd.h>
      #include <sys/priv.h>
      #include <sys/proc.h>
      #include <sys/taskqueue.h>
      #include <sys/fcntl.h>
      #include <sys/jail.h>
      #include <sys/lock.h>
      #include <sys/mutex.h>
      #include <sys/racct.h>
      #include <sys/rctl.h>
      #include <sys/refcount.h>
      #include <sys/sx.h>
      #include <sys/sysent.h>
      #include <sys/namei.h>
      #include <sys/mount.h>
      #include <sys/queue.h>
      #include <sys/socket.h>
      #include <sys/syscallsubr.h>
      #include <sys/sysctl.h>
      #include <sys/vnode.h>
      
      #include <net/if.h>
      #include <net/vnet.h>
      
      #include <netinet/in.h>
      
      #ifdef DDB
      #include <ddb/ddb.h>
      #endif /* DDB */
      
      #include <security/mac/mac_framework.h>
      
      #define        DEFAULT_HOSTUUID        "00000000-0000-0000-0000-000000000000"
      
      MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
      static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
      
      /* Keep struct prison prison0 and some code in kern_jail_set() readable. */
      #ifdef INET
      #ifdef INET6
      #define        _PR_IP_SADDRSEL        PR_IP4_SADDRSEL|PR_IP6_SADDRSEL
      #else
      #define        _PR_IP_SADDRSEL        PR_IP4_SADDRSEL
      #endif
      #else /* !INET */
      #ifdef INET6
      #define        _PR_IP_SADDRSEL        PR_IP6_SADDRSEL
      #else
      #define        _PR_IP_SADDRSEL        0
      #endif
      #endif
      
      /* prison0 describes what is "real" about the system. */
      struct prison prison0 = {
              .pr_id                = 0,
              .pr_name        = "0",
              .pr_ref                = 1,
              .pr_uref        = 1,
              .pr_path        = "/",
              .pr_securelevel        = -1,
              .pr_devfs_rsnum = 0,
              .pr_childmax        = JAIL_MAX,
              .pr_hostuuid        = DEFAULT_HOSTUUID,
              .pr_children        = LIST_HEAD_INITIALIZER(prison0.pr_children),
      #ifdef VIMAGE
              .pr_flags        = PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
      #else
              .pr_flags        = PR_HOST|_PR_IP_SADDRSEL,
      #endif
              .pr_allow        = PR_ALLOW_ALL_STATIC,
      };
      MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
      
      struct bool_flags {
              const char        *name;
              const char        *noname;
              unsigned         flag;
      };
      struct jailsys_flags {
              const char        *name;
              unsigned         disable;
              unsigned         new;
      };
      
      /* allprison, allprison_racct and lastprid are protected by allprison_lock. */
      struct        sx allprison_lock;
      SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
      struct        prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
      LIST_HEAD(, prison_racct) allprison_racct;
      int        lastprid = 0;
      
      static int do_jail_attach(struct thread *td, struct prison *pr);
      static void prison_complete(void *context, int pending);
      static void prison_deref(struct prison *pr, int flags);
      static char *prison_path(struct prison *pr1, struct prison *pr2);
      static void prison_remove_one(struct prison *pr);
      #ifdef RACCT
      static void prison_racct_attach(struct prison *pr);
      static void prison_racct_modify(struct prison *pr);
      static void prison_racct_detach(struct prison *pr);
      #endif
      
      /* Flags for prison_deref */
      #define        PD_DEREF        0x01
      #define        PD_DEUREF        0x02
      #define        PD_LOCKED        0x04
      #define        PD_LIST_SLOCKED        0x08
      #define        PD_LIST_XLOCKED        0x10
      
      /*
       * Parameter names corresponding to PR_* flag values.  Size values are for kvm
       * as we cannot figure out the size of a sparse array, or an array without a
       * terminating entry.
       */
      static struct bool_flags pr_flag_bool[] = {
              {"persist", "nopersist", PR_PERSIST},
      #ifdef INET
              {"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
      #endif
      #ifdef INET6
              {"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
      #endif
      };
      const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
      
      static struct jailsys_flags pr_flag_jailsys[] = {
              {"host", 0, PR_HOST},
      #ifdef VIMAGE
              {"vnet", 0, PR_VNET},
      #endif
      #ifdef INET
              {"ip4", PR_IP4_USER, PR_IP4_USER},
      #endif
      #ifdef INET6
              {"ip6", PR_IP6_USER, PR_IP6_USER},
      #endif
      };
      const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
      
      /* Make this array full-size so dynamic parameters can be added. */
      static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
              {"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
              {"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
              {"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
              {"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
              {"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
              {"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
              {"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
              {"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
              {"allow.reserved_ports", "allow.noreserved_ports",
               PR_ALLOW_RESERVED_PORTS},
              {"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
              {"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
               PR_ALLOW_UNPRIV_DEBUG},
      };
      const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
      
      #define        JAIL_DEFAULT_ALLOW                (PR_ALLOW_SET_HOSTNAME | \
                                               PR_ALLOW_RESERVED_PORTS | \
                                               PR_ALLOW_UNPRIV_DEBUG)
      #define        JAIL_DEFAULT_ENFORCE_STATFS        2
      #define        JAIL_DEFAULT_DEVFS_RSNUM        0
      static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
      static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
      static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
      #if defined(INET) || defined(INET6)
      static unsigned jail_max_af_ips = 255;
      #endif
      
      /*
       * Initialize the parts of prison0 that can't be static-initialized with
       * constants.  This is called from proc0_init() after creating thread0 cpuset.
       */
      void
      prison0_init(void)
      {
      
              prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
              prison0.pr_osreldate = osreldate;
              strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
      }
      
      /*
       * struct jail_args {
       *        struct jail *jail;
       * };
       */
      int
      sys_jail(struct thread *td, struct jail_args *uap)
      {
              uint32_t version;
              int error;
              struct jail j;
      
              error = copyin(uap->jail, &version, sizeof(uint32_t));
              if (error)
                      return (error);
      
              switch (version) {
              case 0:
              {
                      struct jail_v0 j0;
      
                      /* FreeBSD single IPv4 jails. */
                      bzero(&j, sizeof(struct jail));
                      error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
                      if (error)
                              return (error);
                      j.version = j0.version;
                      j.path = j0.path;
                      j.hostname = j0.hostname;
                      j.ip4s = htonl(j0.ip_number);        /* jail_v0 is host order */
                      break;
              }
      
              case 1:
                      /*
                       * Version 1 was used by multi-IPv4 jail implementations
                       * that never made it into the official kernel.
                       */
                      return (EINVAL);
      
              case 2:        /* JAIL_API_VERSION */
		/* FreeBSD multi-IPv4/IPv6, and no-IP jails. */
                      error = copyin(uap->jail, &j, sizeof(struct jail));
                      if (error)
                              return (error);
                      break;
      
              default:
                      /* Sci-Fi jails are not supported, sorry. */
                      return (EINVAL);
              }
              return (kern_jail(td, &j));
      }
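
/*
 * Illustrative sketch only (userland, not part of this file): a caller of
 * the version 2 interface above would typically fill in struct jail along
 * these lines, with placeholder path/hostname/name values:
 *
 *	struct jail j = {
 *		.version = JAIL_API_VERSION,
 *		.path = "/some/jail/root",
 *		.hostname = "example.org",
 *		.jailname = "example",
 *	};
 *	jid = jail(&j);
 *
 * ip4s/ip4 and ip6s/ip6 may also be filled in to supply address lists.
 */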
      
      int
      kern_jail(struct thread *td, struct jail *j)
      {
              struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
      #ifdef INET
                                  + 1
      #endif
      #ifdef INET6
                                  + 1
      #endif
                                  )];
              struct uio opt;
              char *u_path, *u_hostname, *u_name;
              struct bool_flags *bf;
      #ifdef INET
              uint32_t ip4s;
              struct in_addr *u_ip4;
      #endif
      #ifdef INET6
              struct in6_addr *u_ip6;
      #endif
              size_t tmplen;
              int error, enforce_statfs;
      
              bzero(&optiov, sizeof(optiov));
              opt.uio_iov = optiov;
              opt.uio_iovcnt = 0;
              opt.uio_offset = -1;
              opt.uio_resid = -1;
              opt.uio_segflg = UIO_SYSSPACE;
              opt.uio_rw = UIO_READ;
              opt.uio_td = td;
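
	/*
	 * Options are handed to kern_jail_set() as name/value iovec pairs.
	 * Boolean options keep the zeroed value iovec left by the bzero()
	 * above, which is why the loop below advances uio_iovcnt by two
	 * after filling in only the name.
	 */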
      
              /* Set permissions for top-level jails from sysctls. */
              if (!jailed(td->td_ucred)) {
                      for (bf = pr_flag_allow;
                           bf < pr_flag_allow + nitems(pr_flag_allow) &&
                              bf->flag != 0;
                           bf++) {
                              optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
                                  (jail_default_allow & bf->flag)
                                  ? bf->name : bf->noname);
                              optiov[opt.uio_iovcnt].iov_len =
                                  strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
                              opt.uio_iovcnt += 2;
                      }
                      optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
                      optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
                      opt.uio_iovcnt++;
                      enforce_statfs = jail_default_enforce_statfs;
                      optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
                      optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
                      opt.uio_iovcnt++;
              }
      
              tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
      #ifdef INET
              ip4s = (j->version == 0) ? 1 : j->ip4s;
              if (ip4s > jail_max_af_ips)
                      return (EINVAL);
              tmplen += ip4s * sizeof(struct in_addr);
      #else
              if (j->ip4s > 0)
                      return (EINVAL);
      #endif
      #ifdef INET6
              if (j->ip6s > jail_max_af_ips)
                      return (EINVAL);
              tmplen += j->ip6s * sizeof(struct in6_addr);
      #else
              if (j->ip6s > 0)
                      return (EINVAL);
      #endif
              u_path = malloc(tmplen, M_TEMP, M_WAITOK);
              u_hostname = u_path + MAXPATHLEN;
              u_name = u_hostname + MAXHOSTNAMELEN;
      #ifdef INET
              u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
      #endif
      #ifdef INET6
      #ifdef INET
              u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
      #else
              u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
      #endif
      #endif
              optiov[opt.uio_iovcnt].iov_base = "path";
              optiov[opt.uio_iovcnt].iov_len = sizeof("path");
              opt.uio_iovcnt++;
              optiov[opt.uio_iovcnt].iov_base = u_path;
              error = copyinstr(j->path, u_path, MAXPATHLEN,
                  &optiov[opt.uio_iovcnt].iov_len);
              if (error) {
                      free(u_path, M_TEMP);
                      return (error);
              }
              opt.uio_iovcnt++;
              optiov[opt.uio_iovcnt].iov_base = "host.hostname";
              optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
              opt.uio_iovcnt++;
              optiov[opt.uio_iovcnt].iov_base = u_hostname;
              error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
                  &optiov[opt.uio_iovcnt].iov_len);
              if (error) {
                      free(u_path, M_TEMP);
                      return (error);
              }
              opt.uio_iovcnt++;
              if (j->jailname != NULL) {
                      optiov[opt.uio_iovcnt].iov_base = "name";
                      optiov[opt.uio_iovcnt].iov_len = sizeof("name");
                      opt.uio_iovcnt++;
                      optiov[opt.uio_iovcnt].iov_base = u_name;
                      error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
                          &optiov[opt.uio_iovcnt].iov_len);
                      if (error) {
                              free(u_path, M_TEMP);
                              return (error);
                      }
                      opt.uio_iovcnt++;
              }
      #ifdef INET
              optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
              optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
              opt.uio_iovcnt++;
              optiov[opt.uio_iovcnt].iov_base = u_ip4;
              optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
              if (j->version == 0)
                      u_ip4->s_addr = j->ip4s;
              else {
                      error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
                      if (error) {
                              free(u_path, M_TEMP);
                              return (error);
                      }
              }
              opt.uio_iovcnt++;
      #endif
      #ifdef INET6
              optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
              optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
              opt.uio_iovcnt++;
              optiov[opt.uio_iovcnt].iov_base = u_ip6;
              optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
              error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
              if (error) {
                      free(u_path, M_TEMP);
                      return (error);
              }
              opt.uio_iovcnt++;
      #endif
              KASSERT(opt.uio_iovcnt <= nitems(optiov),
                      ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
              error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
              free(u_path, M_TEMP);
              return (error);
      }
      
      /*
       * struct jail_set_args {
       *        struct iovec *iovp;
       *        unsigned int iovcnt;
       *        int flags;
       * };
       */
      int
      sys_jail_set(struct thread *td, struct jail_set_args *uap)
      {
              struct uio *auio;
              int error;
      
              /* Check that we have an even number of iovecs. */
              if (uap->iovcnt & 1)
                      return (EINVAL);
      
              error = copyinuio(uap->iovp, uap->iovcnt, &auio);
              if (error)
                      return (error);
              error = kern_jail_set(td, auio, uap->flags);
              free(auio, M_IOV);
              return (error);
      }
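
/*
 * Illustrative sketch only (userland): the same name/value iovec pairing is
 * what callers of jail_set(2) construct, either through libjail or by hand,
 * e.g. with a placeholder jail name "example":
 *
 *	struct iovec iov[4];
 *	iov[0].iov_base = "name";     iov[0].iov_len = sizeof("name");
 *	iov[1].iov_base = "example";  iov[1].iov_len = sizeof("example");
 *	iov[2].iov_base = "persist";  iov[2].iov_len = sizeof("persist");
 *	iov[3].iov_base = NULL;       iov[3].iov_len = 0;
 *	jid = jail_set(iov, 4, JAIL_CREATE);
 *
 * Boolean parameters such as "persist" take a NULL value of zero length.
 */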
      
      int
      kern_jail_set(struct thread *td, struct uio *optuio, int flags)
      {
              struct nameidata nd;
      #ifdef INET
              struct in_addr *ip4;
      #endif
      #ifdef INET6
              struct in6_addr *ip6;
      #endif
              struct vfsopt *opt;
              struct vfsoptlist *opts;
              struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
              struct vnode *root;
              char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
              char *g_path, *osrelstr;
              struct bool_flags *bf;
              struct jailsys_flags *jsf;
      #if defined(INET) || defined(INET6)
              struct prison *tppr;
              void *op;
      #endif
              unsigned long hid;
              size_t namelen, onamelen, pnamelen;
              int born, created, cuflags, descend, enforce;
              int error, errmsg_len, errmsg_pos;
              int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
              int jid, jsys, len, level;
              int childmax, osreldt, rsnum, slevel;
      #if defined(INET) || defined(INET6)
              int ii, ij;
      #endif
      #ifdef INET
              int ip4s, redo_ip4;
      #endif
      #ifdef INET6
              int ip6s, redo_ip6;
      #endif
              uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
              uint64_t pr_allow_diff;
              unsigned tallow;
              char numbuf[12];
      
              error = priv_check(td, PRIV_JAIL_SET);
              if (!error && (flags & JAIL_ATTACH))
                      error = priv_check(td, PRIV_JAIL_ATTACH);
              if (error)
                      return (error);
              mypr = td->td_ucred->cr_prison;
              if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
                      return (EPERM);
              if (flags & ~JAIL_SET_MASK)
                      return (EINVAL);
      
              /*
               * Check all the parameters before committing to anything.  Not all
               * errors can be caught early, but we may as well try.  Also, this
               * takes care of some expensive stuff (path lookup) before getting
               * the allprison lock.
               *
               * XXX Jails are not filesystems, and jail parameters are not mount
               *     options.  But it makes more sense to re-use the vfsopt code
               *     than duplicate it under a different name.
               */
              error = vfs_buildopts(optuio, &opts);
              if (error)
                      return (error);
      #ifdef INET
              ip4 = NULL;
      #endif
      #ifdef INET6
              ip6 = NULL;
      #endif
              g_path = NULL;
      
              cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
              if (!cuflags) {
                      error = EINVAL;
                      vfs_opterror(opts, "no valid operation (create or update)");
                      goto done_errmsg;
              }
      
              error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
              if (error == ENOENT)
                      jid = 0;
              else if (error != 0)
                      goto done_free;
      
              error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
              if (error == ENOENT)
                      gotslevel = 0;
              else if (error != 0)
                      goto done_free;
              else
                      gotslevel = 1;
      
              error =
                  vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
              if (error == ENOENT)
                      gotchildmax = 0;
              else if (error != 0)
                      goto done_free;
              else
                      gotchildmax = 1;
      
              error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
              if (error == ENOENT)
                      gotenforce = 0;
              else if (error != 0)
                      goto done_free;
              else if (enforce < 0 || enforce > 2) {
                      error = EINVAL;
                      goto done_free;
              } else
                      gotenforce = 1;
      
              error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
              if (error == ENOENT)
                      gotrsnum = 0;
              else if (error != 0)
                      goto done_free;
              else
                      gotrsnum = 1;
      
              pr_flags = ch_flags = 0;
              for (bf = pr_flag_bool;
                   bf < pr_flag_bool + nitems(pr_flag_bool);
                   bf++) {
                      vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
                      vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
              }
              ch_flags |= pr_flags;
              for (jsf = pr_flag_jailsys;
                   jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
                   jsf++) {
                      error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
                      if (error == ENOENT)
                              continue;
                      if (error != 0)
                              goto done_free;
                      switch (jsys) {
                      case JAIL_SYS_DISABLE:
                              if (!jsf->disable) {
                                      error = EINVAL;
                                      goto done_free;
                              }
                              pr_flags |= jsf->disable;
                              break;
                      case JAIL_SYS_NEW:
                              pr_flags |= jsf->new;
                              break;
                      case JAIL_SYS_INHERIT:
                              break;
                      default:
                              error = EINVAL;
                              goto done_free;
                      }
                      ch_flags |= jsf->new | jsf->disable;
              }
              if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
                  && !(pr_flags & PR_PERSIST)) {
                      error = EINVAL;
                      vfs_opterror(opts, "new jail must persist or attach");
                      goto done_errmsg;
              }
      #ifdef VIMAGE
              if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
                      error = EINVAL;
                      vfs_opterror(opts, "vnet cannot be changed after creation");
                      goto done_errmsg;
              }
      #endif
      #ifdef INET
              if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
                      error = EINVAL;
                      vfs_opterror(opts, "ip4 cannot be changed after creation");
                      goto done_errmsg;
              }
      #endif
      #ifdef INET6
              if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
                      error = EINVAL;
                      vfs_opterror(opts, "ip6 cannot be changed after creation");
                      goto done_errmsg;
              }
      #endif
      
              pr_allow = ch_allow = 0;
              for (bf = pr_flag_allow;
                   bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
                   bf++) {
                      vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
                      vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
              }
              ch_allow |= pr_allow;
      
              error = vfs_getopt(opts, "name", (void **)&name, &len);
              if (error == ENOENT)
                      name = NULL;
              else if (error != 0)
                      goto done_free;
              else {
                      if (len == 0 || name[len - 1] != '\0') {
                              error = EINVAL;
                              goto done_free;
                      }
                      if (len > MAXHOSTNAMELEN) {
                              error = ENAMETOOLONG;
                              goto done_free;
                      }
              }
      
              error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
              if (error == ENOENT)
                      host = NULL;
              else if (error != 0)
                      goto done_free;
              else {
                      ch_flags |= PR_HOST;
                      pr_flags |= PR_HOST;
                      if (len == 0 || host[len - 1] != '\0') {
                              error = EINVAL;
                              goto done_free;
                      }
                      if (len > MAXHOSTNAMELEN) {
                              error = ENAMETOOLONG;
                              goto done_free;
                      }
              }
      
              error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
              if (error == ENOENT)
                      domain = NULL;
              else if (error != 0)
                      goto done_free;
              else {
                      ch_flags |= PR_HOST;
                      pr_flags |= PR_HOST;
                      if (len == 0 || domain[len - 1] != '\0') {
                              error = EINVAL;
                              goto done_free;
                      }
                      if (len > MAXHOSTNAMELEN) {
                              error = ENAMETOOLONG;
                              goto done_free;
                      }
              }
      
              error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
              if (error == ENOENT)
                      uuid = NULL;
              else if (error != 0)
                      goto done_free;
              else {
                      ch_flags |= PR_HOST;
                      pr_flags |= PR_HOST;
                      if (len == 0 || uuid[len - 1] != '\0') {
                              error = EINVAL;
                              goto done_free;
                      }
                      if (len > HOSTUUIDLEN) {
                              error = ENAMETOOLONG;
                              goto done_free;
                      }
              }
      
      #ifdef COMPAT_FREEBSD32
              if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
                      uint32_t hid32;
      
                      error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
                      hid = hid32;
              } else
      #endif
                      error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
              if (error == ENOENT)
                      gothid = 0;
              else if (error != 0)
                      goto done_free;
              else {
                      gothid = 1;
                      ch_flags |= PR_HOST;
                      pr_flags |= PR_HOST;
              }
      
      #ifdef INET
              error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
              if (error == ENOENT)
                      ip4s = 0;
              else if (error != 0)
                      goto done_free;
              else if (ip4s & (sizeof(*ip4) - 1)) {
                      error = EINVAL;
                      goto done_free;
              } else {
                      ch_flags |= PR_IP4_USER;
                      pr_flags |= PR_IP4_USER;
                      if (ip4s > 0) {
                              ip4s /= sizeof(*ip4);
                              if (ip4s > jail_max_af_ips) {
                                      error = EINVAL;
                                      vfs_opterror(opts, "too many IPv4 addresses");
                                      goto done_errmsg;
                              }
                              ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
                              bcopy(op, ip4, ip4s * sizeof(*ip4));
			/*
			 * Sort all IP addresses except ip[0], preserving the
			 * primary IP address as given from userland.  This
			 * special IP is used for unbound outgoing connections
			 * as well as for "loopback" traffic in case source
			 * address selection cannot find a more fitting
			 * address to connect from.
			 */
                              if (ip4s > 1)
                                      qsort(ip4 + 1, ip4s - 1, sizeof(*ip4),
                                          prison_qcmp_v4);
                              /*
                               * Check for duplicate addresses and do some simple
                               * zero and broadcast checks. If users give other bogus
                               * addresses it is their problem.
                               *
                               * We do not have to care about byte order for these
                               * checks so we will do them in NBO.
                               */
                              for (ii = 0; ii < ip4s; ii++) {
                                      if (ip4[ii].s_addr == INADDR_ANY ||
                                          ip4[ii].s_addr == INADDR_BROADCAST) {
                                              error = EINVAL;
                                              goto done_free;
                                      }
                                      if ((ii+1) < ip4s &&
                                          (ip4[0].s_addr == ip4[ii+1].s_addr ||
                                           ip4[ii].s_addr == ip4[ii+1].s_addr)) {
                                              error = EINVAL;
                                              goto done_free;
                                      }
                              }
                      }
              }
      #endif
      
      #ifdef INET6
              error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
              if (error == ENOENT)
                      ip6s = 0;
              else if (error != 0)
                      goto done_free;
              else if (ip6s & (sizeof(*ip6) - 1)) {
                      error = EINVAL;
                      goto done_free;
              } else {
                      ch_flags |= PR_IP6_USER;
                      pr_flags |= PR_IP6_USER;
                      if (ip6s > 0) {
                              ip6s /= sizeof(*ip6);
                              if (ip6s > jail_max_af_ips) {
                                      error = EINVAL;
                                      vfs_opterror(opts, "too many IPv6 addresses");
                                      goto done_errmsg;
                              }
                              ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
                              bcopy(op, ip6, ip6s * sizeof(*ip6));
                              if (ip6s > 1)
                                      qsort(ip6 + 1, ip6s - 1, sizeof(*ip6),
                                          prison_qcmp_v6);
                              for (ii = 0; ii < ip6s; ii++) {
                                      if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
                                              error = EINVAL;
                                              goto done_free;
                                      }
                                      if ((ii+1) < ip6s &&
                                          (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
                                           IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
                                      {
                                              error = EINVAL;
                                              goto done_free;
                                      }
                              }
                      }
              }
      #endif
      
      #if defined(VIMAGE) && (defined(INET) || defined(INET6))
              if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
                      error = EINVAL;
                      vfs_opterror(opts,
                          "vnet jails cannot have IP address restrictions");
                      goto done_errmsg;
              }
      #endif
      
              error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
              if (error == ENOENT)
                      osrelstr = NULL;
              else if (error != 0)
                      goto done_free;
              else {
                      if (flags & JAIL_UPDATE) {
                              error = EINVAL;
                              vfs_opterror(opts,
                                  "osrelease cannot be changed after creation");
                              goto done_errmsg;
                      }
                      if (len == 0 || osrelstr[len - 1] != '\0') {
                              error = EINVAL;
                              goto done_free;
                      }
                      if (len >= OSRELEASELEN) {
                              error = ENAMETOOLONG;
                              vfs_opterror(opts,
                                  "osrelease string must be 1-%d bytes long",
                                  OSRELEASELEN - 1);
                              goto done_errmsg;
                      }
              }
      
              error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
              if (error == ENOENT)
                      osreldt = 0;
              else if (error != 0)
                      goto done_free;
              else {
                      if (flags & JAIL_UPDATE) {
                              error = EINVAL;
                              vfs_opterror(opts,
                                  "osreldate cannot be changed after creation");
                              goto done_errmsg;
                      }
                      if (osreldt == 0) {
                              error = EINVAL;
                              vfs_opterror(opts, "osreldate cannot be 0");
                              goto done_errmsg;
                      }
              }
      
              root = NULL;
              error = vfs_getopt(opts, "path", (void **)&path, &len);
              if (error == ENOENT)
                      path = NULL;
              else if (error != 0)
                      goto done_free;
              else {
                      if (flags & JAIL_UPDATE) {
                              error = EINVAL;
                              vfs_opterror(opts,
                                  "path cannot be changed after creation");
                              goto done_errmsg;
                      }
                      if (len == 0 || path[len - 1] != '\0') {
                              error = EINVAL;
                              goto done_free;
                      }
                      NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
                          path, td);
                      error = namei(&nd);
                      if (error)
                              goto done_free;
                      root = nd.ni_vp;
                      NDFREE(&nd, NDF_ONLY_PNBUF);
                      g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
                      strlcpy(g_path, path, MAXPATHLEN);
                      error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
                      if (error == 0) {
                              path = g_path;
                      } else {
                              /* exit on other errors */
                              goto done_free;
                      }
                      if (root->v_type != VDIR) {
                              error = ENOTDIR;
                              vput(root);
                              goto done_free;
                      }
                      VOP_UNLOCK(root);
              }
      
              /*
               * Find the specified jail, or at least its parent.
               * This abuses the file error codes ENOENT and EEXIST.
               */
              pr = NULL;
              ppr = mypr;
              if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
                      namelc = strrchr(name, '.');
                      jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
                      if (*p != '\0')
                              jid = 0;
              }
              sx_xlock(&allprison_lock);
              if (jid != 0) {
                      /*
                       * See if a requested jid already exists.  There is an
                       * information leak here if the jid exists but is not within
                       * the caller's jail hierarchy.  Jail creators will get EEXIST
                       * even though they cannot see the jail, and CREATE | UPDATE
                       * will return ENOENT which is not normally a valid error.
                       */
                      if (jid < 0) {
                              error = EINVAL;
                              vfs_opterror(opts, "negative jid");
                              goto done_unlock_list;
                      }
                      pr = prison_find(jid);
                      if (pr != NULL) {
                              ppr = pr->pr_parent;
                              /* Create: jid must not exist. */
                              if (cuflags == JAIL_CREATE) {
                                      mtx_unlock(&pr->pr_mtx);
                                      error = EEXIST;
                                      vfs_opterror(opts, "jail %d already exists",
                                          jid);
                                      goto done_unlock_list;
                              }
                              if (!prison_ischild(mypr, pr)) {
                                      mtx_unlock(&pr->pr_mtx);
                                      pr = NULL;
                              } else if (pr->pr_uref == 0) {
                                      if (!(flags & JAIL_DYING)) {
                                              mtx_unlock(&pr->pr_mtx);
                                              error = ENOENT;
                                              vfs_opterror(opts, "jail %d is dying",
                                                  jid);
                                              goto done_unlock_list;
                                      } else if ((flags & JAIL_ATTACH) ||
                                          (pr_flags & PR_PERSIST)) {
                                              /*
                                               * A dying jail might be resurrected
                                               * (via attach or persist), but first
                                               * it must determine if another jail
                                               * has claimed its name.  Accomplish
                                               * this by implicitly re-setting the
                                               * name.
                                               */
                                              if (name == NULL)
                                                      name = prison_name(mypr, pr);
                                      }
                              }
                      }
                      if (pr == NULL) {
                              /* Update: jid must exist. */
                              if (cuflags == JAIL_UPDATE) {
                                      error = ENOENT;
                                      vfs_opterror(opts, "jail %d not found", jid);
                                      goto done_unlock_list;
                              }
                      }
              }
              /*
               * If the caller provided a name, look for a jail by that name.
               * This has different semantics for creates and updates keyed by jid
               * (where the name must not already exist in a different jail),
               * and updates keyed by the name itself (where the name must exist
               * because that is the jail being updated).
               */
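	/*
	 * For example, a create request for "foo.bar" requires a visible
	 * jail "foo" (relative to the caller's prison) to serve as the
	 * parent of the new child "bar".
	 */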
              namelc = NULL;
              if (name != NULL) {
                      namelc = strrchr(name, '.');
                      if (namelc == NULL)
                              namelc = name;
                      else {
                              /*
                               * This is a hierarchical name.  Split it into the
                               * parent and child names, and make sure the parent
                               * exists or matches an already found jail.
                               */
                              if (pr != NULL) {
                                      if (strncmp(name, ppr->pr_name, namelc - name)
                                          || ppr->pr_name[namelc - name] != '\0') {
                                              mtx_unlock(&pr->pr_mtx);
                                              error = EINVAL;
                                              vfs_opterror(opts,
                                                  "cannot change jail's parent");
                                              goto done_unlock_list;
                                      }
                              } else {
                                      *namelc = '\0';
                                      ppr = prison_find_name(mypr, name);
                                      if (ppr == NULL) {
                                              error = ENOENT;
                                              vfs_opterror(opts,
                                                  "jail \"%s\" not found", name);
                                              goto done_unlock_list;
                                      }
                                      mtx_unlock(&ppr->pr_mtx);
                                      *namelc = '.';
                              }
                              namelc++;
                      }
                      if (namelc[0] != '\0') {
                              pnamelen =
                                  (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
       name_again:
                              deadpr = NULL;
                              FOREACH_PRISON_CHILD(ppr, tpr) {
                                      if (tpr != pr && tpr->pr_ref > 0 &&
                                          !strcmp(tpr->pr_name + pnamelen, namelc)) {
                                              if (pr == NULL &&
                                                  cuflags != JAIL_CREATE) {
                                                      mtx_lock(&tpr->pr_mtx);
                                                      if (tpr->pr_ref > 0) {
                                                              /*
                                                               * Use this jail
                                                               * for updates.
                                                               */
                                                              if (tpr->pr_uref > 0) {
                                                                      pr = tpr;
                                                                      break;
                                                              }
                                                              deadpr = tpr;
                                                      }
                                                      mtx_unlock(&tpr->pr_mtx);
                                              } else if (tpr->pr_uref > 0) {
                                                      /*
                                                       * Create, or update(jid):
                                                       * name must not exist in an
                                                       * active sibling jail.
                                                       */
                                                      error = EEXIST;
                                                      if (pr != NULL)
                                                              mtx_unlock(&pr->pr_mtx);
                                                      vfs_opterror(opts,
                                                         "jail \"%s\" already exists",
                                                         name);
                                                      goto done_unlock_list;
                                              }
                                      }
                              }
                              /* If no active jail is found, use a dying one. */
                              if (deadpr != NULL && pr == NULL) {
                                      if (flags & JAIL_DYING) {
                                              mtx_lock(&deadpr->pr_mtx);
                                              if (deadpr->pr_ref == 0) {
                                                      mtx_unlock(&deadpr->pr_mtx);
                                                      goto name_again;
                                              }
                                              pr = deadpr;
                                      } else if (cuflags == JAIL_UPDATE) {
                                              error = ENOENT;
                                              vfs_opterror(opts,
                                                  "jail \"%s\" is dying", name);
                                              goto done_unlock_list;
                                      }
                              }
                              /* Update: name must exist if no jid. */
                              else if (cuflags == JAIL_UPDATE && pr == NULL) {
                                      error = ENOENT;
                                      vfs_opterror(opts, "jail \"%s\" not found",
                                          name);
                                      goto done_unlock_list;
                              }
                      }
              }
              /* Update: must provide a jid or name. */
              else if (cuflags == JAIL_UPDATE && pr == NULL) {
                      error = ENOENT;
                      vfs_opterror(opts, "update specified no jail");
                      goto done_unlock_list;
              }
      
              /* If there's no prison to update, create a new one and link it in. */
              if (pr == NULL) {
                      for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
                              if (tpr->pr_childcount >= tpr->pr_childmax) {
                                      error = EPERM;
                                      vfs_opterror(opts, "prison limit exceeded");
                                      goto done_unlock_list;
                              }
                      created = 1;
                      mtx_lock(&ppr->pr_mtx);
                      if (ppr->pr_ref == 0) {
                              mtx_unlock(&ppr->pr_mtx);
                              error = ENOENT;
                              vfs_opterror(opts, "jail \"%s\" not found",
                                  prison_name(mypr, ppr));
                              goto done_unlock_list;
                      }
                      ppr->pr_ref++;
                      ppr->pr_uref++;
                      mtx_unlock(&ppr->pr_mtx);
                      pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
                      if (jid == 0) {
                              /* Find the next free jid. */
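			/*
			 * allprison is kept sorted by jid, so scan from
			 * lastprid + 1, wrapping at JAIL_MAX, and take the
			 * first id that is not held by a referenced prison.
			 */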
                              jid = lastprid + 1;
       findnext:
                              if (jid == JAIL_MAX)
                                      jid = 1;
                              TAILQ_FOREACH(tpr, &allprison, pr_list) {
                                      if (tpr->pr_id < jid)
                                              continue;
                                      if (tpr->pr_id > jid || tpr->pr_ref == 0) {
                                              TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
                                              break;
                                      }
                                      if (jid == lastprid) {
                                              error = EAGAIN;
                                              vfs_opterror(opts,
                                                  "no available jail IDs");
                                              free(pr, M_PRISON);
                                              prison_deref(ppr, PD_DEREF |
                                                  PD_DEUREF | PD_LIST_XLOCKED);
                                              goto done_releroot;
                                      }
                                      jid++;
                                      goto findnext;
                              }
                              lastprid = jid;
                      } else {
			/*
			 * The caller specified a jid that does not already
			 * exist, so just find where to insert it in the list.
			 */
                              TAILQ_FOREACH(tpr, &allprison, pr_list)
                                      if (tpr->pr_id >= jid) {
                                              TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
                                              break;
                                      }
                      }
                      if (tpr == NULL)
                              TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
                      LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
                      for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
                              tpr->pr_childcount++;
      
                      pr->pr_parent = ppr;
                      pr->pr_id = jid;
      
                      /* Set some default values, and inherit some from the parent. */
                      if (namelc == NULL)
                              namelc = "";
                      if (path == NULL) {
                              path = "/";
                              root = mypr->pr_root;
                              vref(root);
                      }
                      strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
                      pr->pr_flags |= PR_HOST;
      #if defined(INET) || defined(INET6)
      #ifdef VIMAGE
                      if (!(pr_flags & PR_VNET))
      #endif
                      {
      #ifdef INET
                              if (!(ch_flags & PR_IP4_USER))
                                      pr->pr_flags |= PR_IP4 | PR_IP4_USER;
                              else if (!(pr_flags & PR_IP4_USER)) {
                                      pr->pr_flags |= ppr->pr_flags & PR_IP4;
                                      if (ppr->pr_ip4 != NULL) {
                                              pr->pr_ip4s = ppr->pr_ip4s;
                                              pr->pr_ip4 = malloc(pr->pr_ip4s *
                                                  sizeof(struct in_addr), M_PRISON,
                                                  M_WAITOK);
                                              bcopy(ppr->pr_ip4, pr->pr_ip4,
                                                  pr->pr_ip4s * sizeof(*pr->pr_ip4));
                                      }
                              }
      #endif
      #ifdef INET6
                              if (!(ch_flags & PR_IP6_USER))
                                      pr->pr_flags |= PR_IP6 | PR_IP6_USER;
                              else if (!(pr_flags & PR_IP6_USER)) {
                                      pr->pr_flags |= ppr->pr_flags & PR_IP6;
                                      if (ppr->pr_ip6 != NULL) {
                                              pr->pr_ip6s = ppr->pr_ip6s;
                                              pr->pr_ip6 = malloc(pr->pr_ip6s *
                                                  sizeof(struct in6_addr), M_PRISON,
                                                  M_WAITOK);
                                              bcopy(ppr->pr_ip6, pr->pr_ip6,
                                                  pr->pr_ip6s * sizeof(*pr->pr_ip6));
                                      }
                              }
      #endif
                      }
      #endif
                      /* Source address selection is always on by default. */
                      pr->pr_flags |= _PR_IP_SADDRSEL;
      
                      pr->pr_securelevel = ppr->pr_securelevel;
                      pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
                      pr->pr_enforce_statfs = jail_default_enforce_statfs;
                      pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
      
                      pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
                      if (osrelstr == NULL)
                              strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
                                  sizeof(pr->pr_osrelease));
                      else
                              strlcpy(pr->pr_osrelease, osrelstr,
                                  sizeof(pr->pr_osrelease));
      
                      LIST_INIT(&pr->pr_children);
                      mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
                      TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
      
      #ifdef VIMAGE
                      /* Allocate a new vnet if specified. */
                      pr->pr_vnet = (pr_flags & PR_VNET)
                          ? vnet_alloc() : ppr->pr_vnet;
      #endif
		/*
		 * Allocate a dedicated cpuset for each jail.
		 * Unlike other initial settings, this may return an error.
		 */
                      error = cpuset_create_root(ppr, &pr->pr_cpuset);
                      if (error) {
                              prison_deref(pr, PD_LIST_XLOCKED);
                              goto done_releroot;
                      }
      
                      mtx_lock(&pr->pr_mtx);
                      /*
                       * New prisons do not yet have a reference, because we do not
                       * want others to see the incomplete prison once the
                       * allprison_lock is downgraded.
                       */
              } else {
                      created = 0;
                      /*
                       * Grab a reference for existing prisons, to ensure they
                       * continue to exist for the duration of the call.
                       */
                      pr->pr_ref++;
      #if defined(VIMAGE) && (defined(INET) || defined(INET6))
                      if ((pr->pr_flags & PR_VNET) &&
                          (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
                              error = EINVAL;
                              vfs_opterror(opts,
                                  "vnet jails cannot have IP address restrictions");
                              goto done_deref_locked;
                      }
      #endif
      #ifdef INET
                      if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
                              error = EINVAL;
                              vfs_opterror(opts,
                                  "ip4 cannot be changed after creation");
                              goto done_deref_locked;
                      }
      #endif
      #ifdef INET6
                      if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
                              error = EINVAL;
                              vfs_opterror(opts,
                                  "ip6 cannot be changed after creation");
                              goto done_deref_locked;
                      }
      #endif
              }
      
              /* Do final error checking before setting anything. */
              if (gotslevel) {
                      if (slevel < ppr->pr_securelevel) {
                              error = EPERM;
                              goto done_deref_locked;
                      }
              }
              if (gotchildmax) {
                      if (childmax >= ppr->pr_childmax) {
                              error = EPERM;
                              goto done_deref_locked;
                      }
              }
              if (gotenforce) {
                      if (enforce < ppr->pr_enforce_statfs) {
                              error = EPERM;
                              goto done_deref_locked;
                      }
              }
              if (gotrsnum) {
                      /*
                       * devfs_rsnum is a uint16_t
                       */
                      if (rsnum < 0 || rsnum > 65535) {
                              error = EINVAL;
                              goto done_deref_locked;
                      }
                      /*
                       * Nested jails always inherit parent's devfs ruleset
                       */
                      if (jailed(td->td_ucred)) {
                              if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
                                      error = EPERM;
                                      goto done_deref_locked;
                              } else
                                      rsnum = ppr->pr_devfs_rsnum;
                      }
              }
      #ifdef INET
              if (ip4s > 0) {
                      if (ppr->pr_flags & PR_IP4) {
                              /*
                               * Make sure the new set of IP addresses is a
                               * subset of the parent's list.  Don't worry
                               * about the parent being unlocked, as any
                               * setting is done with allprison_lock held.
                               */
                              for (ij = 0; ij < ppr->pr_ip4s; ij++)
                                      if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
                                              break;
                              if (ij == ppr->pr_ip4s) {
                                      error = EPERM;
                                      goto done_deref_locked;
                              }
                              if (ip4s > 1) {
                                      for (ii = ij = 1; ii < ip4s; ii++) {
                                              if (ip4[ii].s_addr ==
                                                  ppr->pr_ip4[0].s_addr)
                                                      continue;
                                              for (; ij < ppr->pr_ip4s; ij++)
                                                      if (ip4[ii].s_addr ==
                                                          ppr->pr_ip4[ij].s_addr)
                                                              break;
                                              if (ij == ppr->pr_ip4s)
                                                      break;
                                      }
                                      if (ij == ppr->pr_ip4s) {
                                              error = EPERM;
                                              goto done_deref_locked;
                                      }
                              }
                      }
		/*
		 * Check for conflicting IP addresses.  We permit them
		 * if there is no more than one IP on each jail.  If
		 * there is a duplicate on a jail with more than one
		 * IP, stop checking and return an error.
		 */
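		/*
		 * (E.g. two jails may be bound to the same single address;
		 * a clash is rejected as soon as either jail involved has
		 * more than one address.)
		 */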
      #ifdef VIMAGE
                      for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
                              if (tppr->pr_flags & PR_VNET)
                                      break;
      #else
                      tppr = &prison0;
      #endif
                      FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
                              if (tpr == pr ||
      #ifdef VIMAGE
                                  (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
      #endif
                                  tpr->pr_uref == 0) {
                                      descend = 0;
                                      continue;
                              }
                              if (!(tpr->pr_flags & PR_IP4_USER))
                                      continue;
                              descend = 0;
                              if (tpr->pr_ip4 == NULL ||
                                  (ip4s == 1 && tpr->pr_ip4s == 1))
                                      continue;
                              for (ii = 0; ii < ip4s; ii++) {
                                      if (prison_check_ip4_locked(tpr, &ip4[ii]) ==
                                          0) {
                                              error = EADDRINUSE;
                                              vfs_opterror(opts,
                                                  "IPv4 addresses clash");
                                              goto done_deref_locked;
                                      }
                              }
                      }
              }
      #endif
      #ifdef INET6
              if (ip6s > 0) {
                      if (ppr->pr_flags & PR_IP6) {
                              /*
                               * Make sure the new set of IP addresses is a
                               * subset of the parent's list.
                               */
                              for (ij = 0; ij < ppr->pr_ip6s; ij++)
                                      if (IN6_ARE_ADDR_EQUAL(&ip6[0],
                                          &ppr->pr_ip6[ij]))
                                              break;
                              if (ij == ppr->pr_ip6s) {
                                      error = EPERM;
                                      goto done_deref_locked;
                              }
                              if (ip6s > 1) {
                                      for (ii = ij = 1; ii < ip6s; ii++) {
                                              if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
                                                   &ppr->pr_ip6[0]))
                                                      continue;
                                              for (; ij < ppr->pr_ip6s; ij++)
                                                      if (IN6_ARE_ADDR_EQUAL(
                                                          &ip6[ii], &ppr->pr_ip6[ij]))
                                                              break;
                                              if (ij == ppr->pr_ip6s)
                                                      break;
                                      }
                                      if (ij == ppr->pr_ip6s) {
                                              error = EPERM;
                                              goto done_deref_locked;
                                      }
                              }
                      }
                      /* Check for conflicting IP addresses. */
      #ifdef VIMAGE
                      for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
                              if (tppr->pr_flags & PR_VNET)
                                      break;
      #else
                      tppr = &prison0;
      #endif
                      FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
                              if (tpr == pr ||
      #ifdef VIMAGE
                                  (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
      #endif
                                  tpr->pr_uref == 0) {
                                      descend = 0;
                                      continue;
                              }
                              if (!(tpr->pr_flags & PR_IP6_USER))
                                      continue;
                              descend = 0;
                              if (tpr->pr_ip6 == NULL ||
                                  (ip6s == 1 && tpr->pr_ip6s == 1))
                                      continue;
                              for (ii = 0; ii < ip6s; ii++) {
                                      if (prison_check_ip6_locked(tpr, &ip6[ii]) ==
                                          0) {
                                              error = EADDRINUSE;
                                              vfs_opterror(opts,
                                                  "IPv6 addresses clash");
                                              goto done_deref_locked;
                                      }
                              }
                      }
              }
      #endif
              onamelen = namelen = 0;
              if (namelc != NULL) {
		/*
		 * Give a default name of the jid.  Also allow the name to be
		 * explicitly the jid - but not any other number, and only in
		 * normal form (no leading zero/etc).
		 */
                      if (namelc[0] == '\0')
                              snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
                      else if ((strtoul(namelc, &p, 10) != jid ||
                                namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
                              error = EINVAL;
                              vfs_opterror(opts,
                                  "name cannot be numeric (unless it is the jid)");
                              goto done_deref_locked;
                      }
                      /*
                       * Make sure the name isn't too long for the prison or its
                       * children.
                       */
                      pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
                      onamelen = strlen(pr->pr_name + pnamelen);
                      namelen = strlen(namelc);
                      if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
                              error = ENAMETOOLONG;
                              goto done_deref_locked;
                      }
                      FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
                              if (strlen(tpr->pr_name) + (namelen - onamelen) >=
                                  sizeof(pr->pr_name)) {
                                      error = ENAMETOOLONG;
                                      goto done_deref_locked;
                              }
                      }
              }
              pr_allow_diff = pr_allow & ~ppr->pr_allow;
              if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
                      error = EPERM;
                      goto done_deref_locked;
              }
      
              /*
               * Let modules check their parameters.  This requires unlocking and
               * then re-locking the prison, but this is still a valid state as long
               * as allprison_lock remains xlocked.
               */
              mtx_unlock(&pr->pr_mtx);
              error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
              if (error != 0) {
                      prison_deref(pr, created
                          ? PD_LIST_XLOCKED
                          : PD_DEREF | PD_LIST_XLOCKED);
                      goto done_releroot;
              }
              mtx_lock(&pr->pr_mtx);
      
              /* At this point, all valid parameters should have been noted. */
              TAILQ_FOREACH(opt, opts, link) {
                      if (!opt->seen && strcmp(opt->name, "errmsg")) {
                              error = EINVAL;
                              vfs_opterror(opts, "unknown parameter: %s", opt->name);
                              goto done_deref_locked;
                      }
              }
      
              /* Set the parameters of the prison. */
      #ifdef INET
              redo_ip4 = 0;
              if (pr_flags & PR_IP4_USER) {
                      pr->pr_flags |= PR_IP4;
                      free(pr->pr_ip4, M_PRISON);
                      pr->pr_ip4s = ip4s;
                      pr->pr_ip4 = ip4;
                      ip4 = NULL;
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
      #ifdef VIMAGE
                              if (tpr->pr_flags & PR_VNET) {
                                      descend = 0;
                                      continue;
                              }
      #endif
                              if (prison_restrict_ip4(tpr, NULL)) {
                                      redo_ip4 = 1;
                                      descend = 0;
                              }
                      }
              }
      #endif
      #ifdef INET6
              redo_ip6 = 0;
              if (pr_flags & PR_IP6_USER) {
                      pr->pr_flags |= PR_IP6;
                      free(pr->pr_ip6, M_PRISON);
                      pr->pr_ip6s = ip6s;
                      pr->pr_ip6 = ip6;
                      ip6 = NULL;
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
      #ifdef VIMAGE
                              if (tpr->pr_flags & PR_VNET) {
                                      descend = 0;
                                      continue;
                              }
      #endif
                              if (prison_restrict_ip6(tpr, NULL)) {
                                      redo_ip6 = 1;
                                      descend = 0;
                              }
                      }
              }
      #endif
              if (gotslevel) {
                      pr->pr_securelevel = slevel;
                      /* Set all child jails to be at least this level. */
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
                              if (tpr->pr_securelevel < slevel)
                                      tpr->pr_securelevel = slevel;
              }
              if (gotchildmax) {
                      pr->pr_childmax = childmax;
                      /* Set all child jails to under this limit. */
                      FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
                              if (tpr->pr_childmax > childmax - level)
                                      tpr->pr_childmax = childmax > level
                                          ? childmax - level : 0;
              }
              if (gotenforce) {
                      pr->pr_enforce_statfs = enforce;
                      /* Pass this restriction on to the children. */
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
                              if (tpr->pr_enforce_statfs < enforce)
                                      tpr->pr_enforce_statfs = enforce;
              }
              if (gotrsnum) {
                      pr->pr_devfs_rsnum = rsnum;
                      /* Pass this restriction on to the children. */
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
                              tpr->pr_devfs_rsnum = rsnum;
              }
              if (namelc != NULL) {
                      if (ppr == &prison0)
                              strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
                      else
                              snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
                                  ppr->pr_name, namelc);
                      /* Change this component of child names. */
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
                              bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
                                  strlen(tpr->pr_name + onamelen) + 1);
                              bcopy(pr->pr_name, tpr->pr_name, namelen);
                      }
              }
              if (path != NULL) {
                      /* Try to keep a real-rooted full pathname. */
                      strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
                      pr->pr_root = root;
              }
              if (PR_HOST & ch_flags & ~pr_flags) {
                      if (pr->pr_flags & PR_HOST) {
                              /*
                               * Copy the parent's host info.  As with pr_ip4 above,
                               * the lack of a lock on the parent is not a problem;
                               * it is always set with allprison_lock at least
                               * shared, and is held exclusively here.
                               */
                              strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
                                  sizeof(pr->pr_hostname));
                              strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
                                  sizeof(pr->pr_domainname));
                              strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
                                  sizeof(pr->pr_hostuuid));
                              pr->pr_hostid = pr->pr_parent->pr_hostid;
                      }
              } else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
                      /* Set this prison, and any descendants without PR_HOST. */
                      if (host != NULL)
                              strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
                      if (domain != NULL)
                              strlcpy(pr->pr_domainname, domain, 
                                  sizeof(pr->pr_domainname));
                      if (uuid != NULL)
                              strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
                      if (gothid)
                              pr->pr_hostid = hid;
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
                              if (tpr->pr_flags & PR_HOST)
                                      descend = 0;
                              else {
                                      if (host != NULL)
                                              strlcpy(tpr->pr_hostname,
                                                  pr->pr_hostname,
                                                  sizeof(tpr->pr_hostname));
                                      if (domain != NULL)
                                              strlcpy(tpr->pr_domainname, 
                                                  pr->pr_domainname,
                                                  sizeof(tpr->pr_domainname));
                                      if (uuid != NULL)
                                              strlcpy(tpr->pr_hostuuid,
                                                  pr->pr_hostuuid,
                                                  sizeof(tpr->pr_hostuuid));
                                      if (gothid)
                                              tpr->pr_hostid = hid;
                              }
                      }
              }
              if ((tallow = ch_allow & ~pr_allow)) {
                      /* Clear allow bits in all children. */
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
                              tpr->pr_allow &= ~tallow;
              }
              pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
              /*
               * Persistent prisons get an extra reference, and prisons losing their
               * persist flag lose that reference.  Only do this for existing prisons
               * for now, so new ones will remain unseen until after the module
               * handlers have completed.
               */
              born = pr->pr_uref == 0;
              if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
                      if (pr_flags & PR_PERSIST) {
                              pr->pr_ref++;
                              pr->pr_uref++;
                      } else {
                              pr->pr_ref--;
                              pr->pr_uref--;
                      }
              }
              pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
              mtx_unlock(&pr->pr_mtx);
      
      #ifdef RACCT
              if (racct_enable && created)
                      prison_racct_attach(pr);
      #endif
      
	/*
	 * Locks may have prevented a complete restriction of child IP
	 * addresses.  If so, allocate some more memory and try again.
	 */
      #ifdef INET
              while (redo_ip4) {
                      ip4s = pr->pr_ip4s;
                      ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
                      mtx_lock(&pr->pr_mtx);
                      redo_ip4 = 0;
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
      #ifdef VIMAGE
                              if (tpr->pr_flags & PR_VNET) {
                                      descend = 0;
                                      continue;
                              }
      #endif
                              if (prison_restrict_ip4(tpr, ip4)) {
                                      if (ip4 != NULL)
                                              ip4 = NULL;
                                      else
                                              redo_ip4 = 1;
                              }
                      }
                      mtx_unlock(&pr->pr_mtx);
              }
      #endif
      #ifdef INET6
              while (redo_ip6) {
                      ip6s = pr->pr_ip6s;
                      ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
                      mtx_lock(&pr->pr_mtx);
                      redo_ip6 = 0;
                      FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
      #ifdef VIMAGE
                              if (tpr->pr_flags & PR_VNET) {
                                      descend = 0;
                                      continue;
                              }
      #endif
                              if (prison_restrict_ip6(tpr, ip6)) {
                                      if (ip6 != NULL)
                                              ip6 = NULL;
                                      else
                                              redo_ip6 = 1;
                              }
                      }
                      mtx_unlock(&pr->pr_mtx);
              }
      #endif
      
              /* Let the modules do their work. */
              sx_downgrade(&allprison_lock);
              if (born) {
                      error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
                      if (error) {
                              (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
                              prison_deref(pr, created
                                  ? PD_LIST_SLOCKED
                                  : PD_DEREF | PD_LIST_SLOCKED);
                              goto done_errmsg;
                      }
              }
              error = osd_jail_call(pr, PR_METHOD_SET, opts);
              if (error) {
                      if (born)
                              (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
                      prison_deref(pr, created
                          ? PD_LIST_SLOCKED
                          : PD_DEREF | PD_LIST_SLOCKED);
                      goto done_errmsg;
              }
      
              /* Attach this process to the prison if requested. */
              if (flags & JAIL_ATTACH) {
                      mtx_lock(&pr->pr_mtx);
                      error = do_jail_attach(td, pr);
                      if (error) {
                              vfs_opterror(opts, "attach failed");
                              if (!created)
                                      prison_deref(pr, PD_DEREF);
                              goto done_errmsg;
                      }
              }
      
      #ifdef RACCT
              if (racct_enable && !created) {
                      if (!(flags & JAIL_ATTACH))
                              sx_sunlock(&allprison_lock);
                      prison_racct_modify(pr);
                      if (!(flags & JAIL_ATTACH))
                              sx_slock(&allprison_lock);
              }
      #endif
      
              td->td_retval[0] = pr->pr_id;
      
              /*
               * Now that it is all there, drop the temporary reference from existing
               * prisons.  Or add a reference to newly created persistent prisons
               * (which was not done earlier so that the prison would not be publicly
               * visible).
               */
              if (!created) {
                      prison_deref(pr, (flags & JAIL_ATTACH)
                          ? PD_DEREF
                          : PD_DEREF | PD_LIST_SLOCKED);
              } else {
                      if (pr_flags & PR_PERSIST) {
                              mtx_lock(&pr->pr_mtx);
                              pr->pr_ref++;
                              pr->pr_uref++;
                              mtx_unlock(&pr->pr_mtx);
                      }
                      if (!(flags & JAIL_ATTACH))
                              sx_sunlock(&allprison_lock);
              }
      
              goto done_free;
      
       done_deref_locked:
              prison_deref(pr, created
                  ? PD_LOCKED | PD_LIST_XLOCKED
                  : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
              goto done_releroot;
       done_unlock_list:
              sx_xunlock(&allprison_lock);
       done_releroot:
              if (root != NULL)
                      vrele(root);
       done_errmsg:
              if (error) {
                      if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
                          &errmsg_len) == 0 && errmsg_len > 0) {
                              errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
                              if (optuio->uio_segflg == UIO_SYSSPACE)
                                      bcopy(errmsg,
                                          optuio->uio_iov[errmsg_pos].iov_base,
                                          errmsg_len);
                              else
                                      copyout(errmsg,
                                          optuio->uio_iov[errmsg_pos].iov_base,
                                          errmsg_len);
                      }
              }
       done_free:
      #ifdef INET
              free(ip4, M_PRISON);
      #endif
      #ifdef INET6
              free(ip6, M_PRISON);
      #endif
              if (g_path != NULL)
                      free(g_path, M_TEMP);
              vfs_freeopts(opts);
              return (error);
      }
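
/*
 * Usage sketch (illustrative only; not part of the kernel): userland hands
 * kern_jail_set() its parameters as name/value iovec pairs via jail_set(2).
 * The jail name below is made up, headers and error handling are omitted,
 * and boolean parameters such as "persist" take a NULL value of zero length.
 *
 *	char k_name[] = "name", v_name[] = "example";
 *	char k_persist[] = "persist";
 *	struct iovec iov[] = {
 *		{ .iov_base = k_name,    .iov_len = sizeof(k_name) },
 *		{ .iov_base = v_name,    .iov_len = sizeof(v_name) },
 *		{ .iov_base = k_persist, .iov_len = sizeof(k_persist) },
 *		{ .iov_base = NULL,      .iov_len = 0 },
 *	};
 *	int jid = jail_set(iov, 4, JAIL_CREATE);
 */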
      
      /*
       * struct jail_get_args {
       *        struct iovec *iovp;
       *        unsigned int iovcnt;
       *        int flags;
       * };
       */
      int
      sys_jail_get(struct thread *td, struct jail_get_args *uap)
      {
              struct uio *auio;
              int error;
      
              /* Check that we have an even number of iovecs. */
              if (uap->iovcnt & 1)
                      return (EINVAL);
      
              error = copyinuio(uap->iovp, uap->iovcnt, &auio);
              if (error)
                      return (error);
              error = kern_jail_get(td, auio, uap->flags);
              if (error == 0)
                      error = copyout(auio->uio_iov, uap->iovp,
                          uap->iovcnt * sizeof (struct iovec));
              free(auio, M_IOV);
              return (error);
      }
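
/*
 * Usage sketch (illustrative only): userland retrieves parameters with the
 * same name/value iovec convention, e.g. looking up a jail's name by jid.
 * Buffer sizes and error handling here are arbitrary.
 *
 *	char k_jid[] = "jid", k_name[] = "name", namebuf[256];
 *	int jid = 1;
 *	struct iovec iov[] = {
 *		{ .iov_base = k_jid,   .iov_len = sizeof(k_jid) },
 *		{ .iov_base = &jid,    .iov_len = sizeof(jid) },
 *		{ .iov_base = k_name,  .iov_len = sizeof(k_name) },
 *		{ .iov_base = namebuf, .iov_len = sizeof(namebuf) },
 *	};
 *	if (jail_get(iov, 4, 0) >= 0)
 *		printf("jail %d is %s\n", jid, namebuf);
 */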
      
      int
      kern_jail_get(struct thread *td, struct uio *optuio, int flags)
      {
              struct bool_flags *bf;
              struct jailsys_flags *jsf;
              struct prison *pr, *mypr;
              struct vfsopt *opt;
              struct vfsoptlist *opts;
              char *errmsg, *name;
              int error, errmsg_len, errmsg_pos, i, jid, len, locked, pos;
              unsigned f;
      
              if (flags & ~JAIL_GET_MASK)
                      return (EINVAL);
      
              /* Get the parameter list. */
              error = vfs_buildopts(optuio, &opts);
              if (error)
                      return (error);
              errmsg_pos = vfs_getopt_pos(opts, "errmsg");
              mypr = td->td_ucred->cr_prison;
      
              /*
               * Find the prison specified by one of: lastjid, jid, name.
               */
              sx_slock(&allprison_lock);
              error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
              if (error == 0) {
                      TAILQ_FOREACH(pr, &allprison, pr_list) {
                              if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
                                      mtx_lock(&pr->pr_mtx);
                                      if (pr->pr_ref > 0 &&
                                          (pr->pr_uref > 0 || (flags & JAIL_DYING)))
                                              break;
                                      mtx_unlock(&pr->pr_mtx);
                              }
                      }
                      if (pr != NULL)
                              goto found_prison;
                      error = ENOENT;
                      vfs_opterror(opts, "no jail after %d", jid);
                      goto done_unlock_list;
              } else if (error != ENOENT)
                      goto done_unlock_list;
      
              error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
              if (error == 0) {
                      if (jid != 0) {
                              pr = prison_find_child(mypr, jid);
                              if (pr != NULL) {
                                      if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
                                              mtx_unlock(&pr->pr_mtx);
                                              error = ENOENT;
                                              vfs_opterror(opts, "jail %d is dying",
                                                  jid);
                                              goto done_unlock_list;
                                      }
                                      goto found_prison;
                              }
                              error = ENOENT;
                              vfs_opterror(opts, "jail %d not found", jid);
                              goto done_unlock_list;
                      }
              } else if (error != ENOENT)
                      goto done_unlock_list;
      
              error = vfs_getopt(opts, "name", (void **)&name, &len);
              if (error == 0) {
                      if (len == 0 || name[len - 1] != '\0') {
                              error = EINVAL;
                              goto done_unlock_list;
                      }
                      pr = prison_find_name(mypr, name);
                      if (pr != NULL) {
                              if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
                                      mtx_unlock(&pr->pr_mtx);
                                      error = ENOENT;
                                      vfs_opterror(opts, "jail \"%s\" is dying",
                                          name);
                                      goto done_unlock_list;
                              }
                              goto found_prison;
                      }
                      error = ENOENT;
                      vfs_opterror(opts, "jail \"%s\" not found", name);
                      goto done_unlock_list;
              } else if (error != ENOENT)
                      goto done_unlock_list;
      
              vfs_opterror(opts, "no jail specified");
              error = ENOENT;
              goto done_unlock_list;
      
       found_prison:
              /* Get the parameters of the prison. */
              pr->pr_ref++;
              locked = PD_LOCKED;
              td->td_retval[0] = pr->pr_id;
              error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
              error = vfs_setopt(opts, "parent", &i, sizeof(i));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopts(opts, "name", prison_name(mypr, pr));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
                  sizeof(pr->pr_cpuset->cs_id));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopts(opts, "path", prison_path(mypr, pr));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
      #ifdef INET
              error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
                  pr->pr_ip4s * sizeof(*pr->pr_ip4));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
      #endif
      #ifdef INET6
              error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
                  pr->pr_ip6s * sizeof(*pr->pr_ip6));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
      #endif
              error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
                  sizeof(pr->pr_securelevel));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
                  sizeof(pr->pr_childcount));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
                  sizeof(pr->pr_childmax));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
              if (error != 0 && error != ENOENT)
                      goto done_deref;
      #ifdef COMPAT_FREEBSD32
              if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
                      uint32_t hid32 = pr->pr_hostid;
      
                      error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
              } else
      #endif
              error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
                  sizeof(pr->pr_hostid));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
                  sizeof(pr->pr_enforce_statfs));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
                  sizeof(pr->pr_devfs_rsnum));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              for (bf = pr_flag_bool;
                   bf < pr_flag_bool + nitems(pr_flag_bool);
                   bf++) {
                      i = (pr->pr_flags & bf->flag) ? 1 : 0;
                      error = vfs_setopt(opts, bf->name, &i, sizeof(i));
                      if (error != 0 && error != ENOENT)
                              goto done_deref;
                      i = !i;
                      error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
                      if (error != 0 && error != ENOENT)
                              goto done_deref;
              }
              for (jsf = pr_flag_jailsys;
                   jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
                   jsf++) {
                      f = pr->pr_flags & (jsf->disable | jsf->new);
                      i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
                          : (f == jsf->new) ? JAIL_SYS_NEW
                          : JAIL_SYS_INHERIT;
                      error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
                      if (error != 0 && error != ENOENT)
                              goto done_deref;
              }
              for (bf = pr_flag_allow;
                   bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
                   bf++) {
                      i = (pr->pr_allow & bf->flag) ? 1 : 0;
                      error = vfs_setopt(opts, bf->name, &i, sizeof(i));
                      if (error != 0 && error != ENOENT)
                              goto done_deref;
                      i = !i;
                      error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
                      if (error != 0 && error != ENOENT)
                              goto done_deref;
              }
              i = (pr->pr_uref == 0);
              error = vfs_setopt(opts, "dying", &i, sizeof(i));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              i = !i;
              error = vfs_setopt(opts, "nodying", &i, sizeof(i));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
                  sizeof(pr->pr_osreldate));
              if (error != 0 && error != ENOENT)
                      goto done_deref;
              error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
              if (error != 0 && error != ENOENT)
                      goto done_deref;
      
              /* Get the module parameters. */
              mtx_unlock(&pr->pr_mtx);
              locked = 0;
              error = osd_jail_call(pr, PR_METHOD_GET, opts);
              if (error)
                      goto done_deref;
              prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
      
              /* By now, all parameters should have been noted. */
              TAILQ_FOREACH(opt, opts, link) {
                      if (!opt->seen && strcmp(opt->name, "errmsg")) {
                              error = EINVAL;
                              vfs_opterror(opts, "unknown parameter: %s", opt->name);
                              goto done_errmsg;
                      }
              }
      
              /* Write the fetched parameters back to userspace. */
              error = 0;
              TAILQ_FOREACH(opt, opts, link) {
                      if (opt->pos >= 0 && opt->pos != errmsg_pos) {
                              pos = 2 * opt->pos + 1;
                              optuio->uio_iov[pos].iov_len = opt->len;
                              if (opt->value != NULL) {
                                      if (optuio->uio_segflg == UIO_SYSSPACE) {
                                              bcopy(opt->value,
                                                  optuio->uio_iov[pos].iov_base,
                                                  opt->len);
                                      } else {
                                              error = copyout(opt->value,
                                                  optuio->uio_iov[pos].iov_base,
                                                  opt->len);
                                              if (error)
                                                      break;
                                      }
                              }
                      }
              }
              goto done_errmsg;
      
       done_deref:
              prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
              goto done_errmsg;
      
       done_unlock_list:
              sx_sunlock(&allprison_lock);
       done_errmsg:
              if (error && errmsg_pos >= 0) {
                      vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
                      errmsg_pos = 2 * errmsg_pos + 1;
                      if (errmsg_len > 0) {
                              if (optuio->uio_segflg == UIO_SYSSPACE)
                                      bcopy(errmsg,
                                          optuio->uio_iov[errmsg_pos].iov_base,
                                          errmsg_len);
                              else
                                      copyout(errmsg,
                                          optuio->uio_iov[errmsg_pos].iov_base,
                                          errmsg_len);
                      }
              }
              vfs_freeopts(opts);
              return (error);
      }
      
      /*
       * struct jail_remove_args {
       *        int jid;
       * };
       */
      int
      sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
      {
              struct prison *pr, *cpr, *lpr, *tpr;
              int descend, error;
      
              error = priv_check(td, PRIV_JAIL_REMOVE);
              if (error)
                      return (error);
      
              sx_xlock(&allprison_lock);
              pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
              if (pr == NULL) {
                      sx_xunlock(&allprison_lock);
                      return (EINVAL);
              }
      
              /* Remove all descendants of this prison, then remove this prison. */
              pr->pr_ref++;
              if (!LIST_EMPTY(&pr->pr_children)) {
                      mtx_unlock(&pr->pr_mtx);
                      lpr = NULL;
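		/*
		 * Hold each live child before removing the previously held
		 * one: prison_remove_one() drops allprison_lock, and the
		 * extra reference keeps the iteration cursor valid while
		 * the lock is not held.
		 */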
                      FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
                              mtx_lock(&cpr->pr_mtx);
                              if (cpr->pr_ref > 0) {
                                      tpr = cpr;
                                      cpr->pr_ref++;
                              } else {
                                      /* Already removed - do not do it again. */
                                      tpr = NULL;
                              }
                              mtx_unlock(&cpr->pr_mtx);
                              if (lpr != NULL) {
                                      mtx_lock(&lpr->pr_mtx);
                                      prison_remove_one(lpr);
                                      sx_xlock(&allprison_lock);
                              }
                              lpr = tpr;
                      }
                      if (lpr != NULL) {
                              mtx_lock(&lpr->pr_mtx);
                              prison_remove_one(lpr);
                              sx_xlock(&allprison_lock);
                      }
                      mtx_lock(&pr->pr_mtx);
              }
              prison_remove_one(pr);
              return (0);
      }
      
      static void
      prison_remove_one(struct prison *pr)
      {
              struct proc *p;
              int deuref;
      
              /* If the prison was persistent, it is not anymore. */
              deuref = 0;
              if (pr->pr_flags & PR_PERSIST) {
                      pr->pr_ref--;
                      deuref = PD_DEUREF;
                      pr->pr_flags &= ~PR_PERSIST;
              }
      
              /*
               * jail_remove added a reference.  If that's the only one, remove
               * the prison now.
               */
              KASSERT(pr->pr_ref > 0,
                  ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
              if (pr->pr_ref == 1) {
                      prison_deref(pr,
                          deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
                      return;
              }
      
              mtx_unlock(&pr->pr_mtx);
              sx_xunlock(&allprison_lock);
              /*
               * Kill all processes unfortunate enough to be attached to this prison.
               */
              sx_slock(&allproc_lock);
              FOREACH_PROC_IN_SYSTEM(p) {
                      PROC_LOCK(p);
                      if (p->p_state != PRS_NEW && p->p_ucred &&
                          p->p_ucred->cr_prison == pr)
                              kern_psignal(p, SIGKILL);
                      PROC_UNLOCK(p);
              }
              sx_sunlock(&allproc_lock);
              /* Remove the temporary reference added by jail_remove. */
              prison_deref(pr, deuref | PD_DEREF);
      }
      
      /*
       * struct jail_attach_args {
       *        int jid;
       * };
       */
      int
      sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
      {
              struct prison *pr;
              int error;
      
              error = priv_check(td, PRIV_JAIL_ATTACH);
              if (error)
                      return (error);
      
              /*
               * Start with exclusive hold on allprison_lock to ensure that a possible
               * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove.
               * But then immediately downgrade it since we don't need to stop
               * readers.
               */
              sx_xlock(&allprison_lock);
              sx_downgrade(&allprison_lock);
              pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
              if (pr == NULL) {
                      sx_sunlock(&allprison_lock);
                      return (EINVAL);
              }
      
              /*
               * Do not allow a process to attach to a prison that is not
               * considered to be "alive".
               */
              if (pr->pr_uref == 0) {
                      mtx_unlock(&pr->pr_mtx);
                      sx_sunlock(&allprison_lock);
                      return (EINVAL);
              }
      
              return (do_jail_attach(td, pr));
      }
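
/*
 * Usage sketch (illustrative only): from userland this is simply
 * jail_attach(2) with a jid obtained from jail_set(2) or jail_get(2).
 * On success the process's root directory, working directory, and
 * credentials all refer to the jail.
 *
 *	if (jail_attach(jid) == -1)
 *		err(1, "jail_attach");
 */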
      
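/*
 * Attach the calling thread's process to the given prison.  Callers pass
 * the prison locked, with allprison_lock held shared; both locks are
 * released before this function returns.
 */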
      static int
      do_jail_attach(struct thread *td, struct prison *pr)
      {
              struct proc *p;
              struct ucred *newcred, *oldcred;
              int error;
      
              /*
               * XXX: Note that there is a slight race here if two threads
               * in the same privileged process attempt to attach to two
               * different jails at the same time.  It is important for
               * user processes not to do this, or they might end up with
               * a process root from one prison, but attached to the jail
               * of another.
               */
              pr->pr_ref++;
              pr->pr_uref++;
              mtx_unlock(&pr->pr_mtx);
      
              /* Let modules do whatever they need to prepare for attaching. */
              error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
              if (error) {
                      prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
                      return (error);
              }
              sx_sunlock(&allprison_lock);
      
              /*
               * Reparent the newly attached process to this jail.
               */
              p = td->td_proc;
              error = cpuset_setproc_update_set(p, pr->pr_cpuset);
              if (error)
                      goto e_revert_osd;
      
              vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
              if ((error = change_dir(pr->pr_root, td)) != 0)
                      goto e_unlock;
      #ifdef MAC
              if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
                      goto e_unlock;
      #endif
              VOP_UNLOCK(pr->pr_root);
              if ((error = pwd_chroot(td, pr->pr_root)))
                      goto e_revert_osd;
      
              newcred = crget();
              PROC_LOCK(p);
              oldcred = crcopysafe(p, newcred);
              newcred->cr_prison = pr;
              proc_set_cred(p, newcred);
              setsugid(p);
      #ifdef RACCT
              racct_proc_ucred_changed(p, oldcred, newcred);
              crhold(newcred);
      #endif
              PROC_UNLOCK(p);
      #ifdef RCTL
              rctl_proc_ucred_changed(p, newcred);
              crfree(newcred);
      #endif
              prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF);
              crfree(oldcred);
              return (0);
      
       e_unlock:
              VOP_UNLOCK(pr->pr_root);
       e_revert_osd:
              /* Tell modules this thread is still in its old jail after all. */
              (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
              prison_deref(pr, PD_DEREF | PD_DEUREF);
              return (error);
      }
      
      /*
       * Returns a locked prison instance, or NULL on failure.
       */
      struct prison *
      prison_find(int prid)
      {
              struct prison *pr;
      
              sx_assert(&allprison_lock, SX_LOCKED);
              TAILQ_FOREACH(pr, &allprison, pr_list) {
                      if (pr->pr_id == prid) {
                              mtx_lock(&pr->pr_mtx);
                              if (pr->pr_ref > 0)
                                      return (pr);
                              mtx_unlock(&pr->pr_mtx);
                      }
              }
              return (NULL);
      }
      
      /*
       * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
       */
      struct prison *
      prison_find_child(struct prison *mypr, int prid)
      {
              struct prison *pr;
              int descend;
      
              sx_assert(&allprison_lock, SX_LOCKED);
              FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
                      if (pr->pr_id == prid) {
                              mtx_lock(&pr->pr_mtx);
                              if (pr->pr_ref > 0)
                                      return (pr);
                              mtx_unlock(&pr->pr_mtx);
                      }
              }
              return (NULL);
      }
      
      /*
       * Look for the name relative to mypr.  Returns a locked prison or NULL.
       */
      struct prison *
      prison_find_name(struct prison *mypr, const char *name)
      {
              struct prison *pr, *deadpr;
              size_t mylen;
              int descend;
      
              sx_assert(&allprison_lock, SX_LOCKED);
              mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
       again:
              deadpr = NULL;
              FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
                      if (!strcmp(pr->pr_name + mylen, name)) {
                              mtx_lock(&pr->pr_mtx);
                              if (pr->pr_ref > 0) {
                                      if (pr->pr_uref > 0)
                                              return (pr);
                                      deadpr = pr;
                              }
                              mtx_unlock(&pr->pr_mtx);
                      }
              }
              /* There was no valid prison - perhaps there was a dying one. */
              if (deadpr != NULL) {
                      mtx_lock(&deadpr->pr_mtx);
                      if (deadpr->pr_ref == 0) {
                              mtx_unlock(&deadpr->pr_mtx);
                              goto again;
                      }
              }
              return (deadpr);
      }
      
      /*
       * See if a prison has the specific flag set.
       */
      int
      prison_flag(struct ucred *cred, unsigned flag)
{
      
              /* This is an atomic read, so no locking is necessary. */
              return (cred->cr_prison->pr_flags & flag);
      }
      
      int
      prison_allow(struct ucred *cred, unsigned flag)
      {
      
              /* This is an atomic read, so no locking is necessary. */
              return (cred->cr_prison->pr_allow & flag);
      }
      
      /*
       * Remove a prison reference.  If that was the last reference, remove the
       * prison itself - but not in this context in case there are locks held.
       */
      void
      prison_free_locked(struct prison *pr)
      {
              int ref;
      
              mtx_assert(&pr->pr_mtx, MA_OWNED);
              ref = --pr->pr_ref;
              mtx_unlock(&pr->pr_mtx);
	if (ref == 0)
                      taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
      }
      
      void
      prison_free(struct prison *pr)
{
      
              mtx_lock(&pr->pr_mtx);
              prison_free_locked(pr);
      }
      
      /*
       * Complete a call to either prison_free or prison_proc_free.
       */
      static void
      prison_complete(void *context, int pending)
      {
              struct prison *pr = context;
      
              sx_xlock(&allprison_lock);
              mtx_lock(&pr->pr_mtx);
              prison_deref(pr, pr->pr_uref
                  ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED
                  : PD_LOCKED | PD_LIST_XLOCKED);
      }
      
      /*
       * Remove a prison reference (usually).  This internal version assumes no
       * mutexes are held, except perhaps the prison itself.  If there are no more
       * references, release and delist the prison.  On completion, the prison lock
       * and the allprison lock are both unlocked.
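 *
 * The flag bits describe what the caller holds and what to drop: PD_DEREF
 * drops a pr_ref, PD_DEUREF drops a pr_uref, PD_LOCKED means the prison
 * mutex is already held on entry, and PD_LIST_SLOCKED / PD_LIST_XLOCKED
 * mean allprison_lock is held shared or exclusive.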
       */
      static void
      prison_deref(struct prison *pr, int flags)
      {
              struct prison *ppr, *tpr;
              int ref, lasturef;
      
              if (!(flags & PD_LOCKED))
                      mtx_lock(&pr->pr_mtx);
              for (;;) {
                      if (flags & PD_DEUREF) {
                              KASSERT(pr->pr_uref > 0,
                                  ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
                                   pr->pr_id));
                              pr->pr_uref--;
                              lasturef = pr->pr_uref == 0;
                              if (lasturef)
                                      pr->pr_ref++;
                              KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
                      } else
                              lasturef = 0;
                      if (flags & PD_DEREF) {
                              KASSERT(pr->pr_ref > 0,
                                  ("prison_deref PD_DEREF on a dead prison (jid=%d)",
                                   pr->pr_id));
                              pr->pr_ref--;
                      }
                      ref = pr->pr_ref;
                      mtx_unlock(&pr->pr_mtx);
      
                      /*
                       * Tell the modules if the last user reference was removed
		 * (even if it sticks around in dying state).
                       */
                      if (lasturef) {
                              if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) {
                                      sx_xlock(&allprison_lock);
                                      flags |= PD_LIST_XLOCKED;
                              }
                              (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
                              mtx_lock(&pr->pr_mtx);
                              ref = --pr->pr_ref;
                              mtx_unlock(&pr->pr_mtx);
                      }
      
                      /* If the prison still has references, nothing else to do. */
                      if (ref > 0) {
                              if (flags & PD_LIST_SLOCKED)
                                      sx_sunlock(&allprison_lock);
                              else if (flags & PD_LIST_XLOCKED)
                                      sx_xunlock(&allprison_lock);
                              return;
                      }
      
                      if (flags & PD_LIST_SLOCKED) {
                              if (!sx_try_upgrade(&allprison_lock)) {
                                      sx_sunlock(&allprison_lock);
                                      sx_xlock(&allprison_lock);
                              }
                      } else if (!(flags & PD_LIST_XLOCKED))
                              sx_xlock(&allprison_lock);
      
                      TAILQ_REMOVE(&allprison, pr, pr_list);
                      LIST_REMOVE(pr, pr_sibling);
                      ppr = pr->pr_parent;
                      for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
                              tpr->pr_childcount--;
                      sx_xunlock(&allprison_lock);
      
      #ifdef VIMAGE
                      if (pr->pr_vnet != ppr->pr_vnet)
                              vnet_destroy(pr->pr_vnet);
      #endif
                      if (pr->pr_root != NULL)
                              vrele(pr->pr_root);
                      mtx_destroy(&pr->pr_mtx);
      #ifdef INET
                      free(pr->pr_ip4, M_PRISON);
      #endif
      #ifdef INET6
                      free(pr->pr_ip6, M_PRISON);
      #endif
                      if (pr->pr_cpuset != NULL)
                              cpuset_rel(pr->pr_cpuset);
                      osd_jail_exit(pr);
      #ifdef RACCT
                      if (racct_enable)
                              prison_racct_detach(pr);
      #endif
                      free(pr, M_PRISON);
      
                      /* Removing a prison frees a reference on its parent. */
                      pr = ppr;
                      mtx_lock(&pr->pr_mtx);
                      flags = PD_DEREF | PD_DEUREF;
              }
      }
      
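/*
 * Add a reference to a prison that is not already dead (pr_ref > 0).
 * prison_hold() is the variant for callers not holding the prison mutex.
 */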
      void
      prison_hold_locked(struct prison *pr)
      {
      
              mtx_assert(&pr->pr_mtx, MA_OWNED);
              KASSERT(pr->pr_ref > 0,
                  ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
	pr->pr_ref++;
      }
      
      void
      prison_hold(struct prison *pr)
{
      
              mtx_lock(&pr->pr_mtx);
              prison_hold_locked(pr);
              mtx_unlock(&pr->pr_mtx);
      }
      
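/*
 * Add a user reference, e.g. for a new process being placed in the prison;
 * the prison must still be alive (pr_uref > 0).
 */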
      void
      prison_proc_hold(struct prison *pr)
      {
      
              mtx_lock(&pr->pr_mtx);
              KASSERT(pr->pr_uref > 0,
                  ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
              pr->pr_uref++;
              mtx_unlock(&pr->pr_mtx);
      }
      
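/*
 * Drop the user reference held for a process leaving the prison.  The last
 * user reference is not dropped here; instead the work is deferred to the
 * prison's task (prison_complete) so the final teardown does not run in
 * this context.
 */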
      void
      prison_proc_free(struct prison *pr)
      {
      
              mtx_lock(&pr->pr_mtx);
              KASSERT(pr->pr_uref > 0,
                  ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
              if (pr->pr_uref > 1)
                      pr->pr_uref--;
              else {
                      /*
                       * Don't remove the last user reference in this context, which
                       * is expected to be a process that is not only locked, but
                       * also half dead.
                       */
                      pr->pr_ref++;
                      mtx_unlock(&pr->pr_mtx);
                      taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
                      return;
              }
              mtx_unlock(&pr->pr_mtx);
      }
      
      /*
       * Check if a jail supports the given address family.
       *
 * Returns 0 if the caller is not jailed or if the jail supports the address
 * family, and EAFNOSUPPORT otherwise.
       */
      int
      prison_check_af(struct ucred *cred, int af)
{
              struct prison *pr;
              int error;
      
              KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
      
              pr = cred->cr_prison;
      #ifdef VIMAGE
              /* Prisons with their own network stack are not limited. */
	if (prison_owns_vnet(cred))
                      return (0);
      #endif
      
              error = 0;
	switch (af) {
      #ifdef INET
              case AF_INET:
		if (pr->pr_flags & PR_IP4) {
                              mtx_lock(&pr->pr_mtx);
                              if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
                                      error = EAFNOSUPPORT;
                              mtx_unlock(&pr->pr_mtx);
                      }
                      break;
      #endif
      #ifdef INET6
              case AF_INET6:
		if (pr->pr_flags & PR_IP6) {
                              mtx_lock(&pr->pr_mtx);
                              if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
                                      error = EAFNOSUPPORT;
                              mtx_unlock(&pr->pr_mtx);
                      }
                      break;
      #endif
              case AF_LOCAL:
              case AF_ROUTE:
                      break;
              default:
                      if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
                              error = EAFNOSUPPORT;
              }
              return (error);
      }
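
/*
 * Illustrative caller (not taken from this file): a protocol attach routine
 * might reject unsupported address families up front:
 *
 *	error = prison_check_af(cred, AF_INET6);
 *	if (error != 0)
 *		return (error);
 */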
      
      /*
       * Check if given address belongs to the jail referenced by cred (wrapper to
       * prison_check_ip[46]).
       *
       * Returns 0 if jail doesn't restrict the address family or if address belongs
       * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
 * the jail doesn't allow the address family.  The IPv4 address is passed
 * in network byte order.
       */
      int
      prison_if(struct ucred *cred, const struct sockaddr *sa)
{
      #ifdef INET
              const struct sockaddr_in *sai;
      #endif
      #ifdef INET6
              const struct sockaddr_in6 *sai6;
      #endif
              int error;
      
              KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
              KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
      
      #ifdef VIMAGE
	if (prison_owns_vnet(cred))
                      return (0);
      #endif
      
              error = 0;
	switch (sa->sa_family) {
      #ifdef INET
              case AF_INET:
                      sai = (const struct sockaddr_in *)sa;
                      error = prison_check_ip4(cred, &sai->sin_addr);
                      break;
      #endif
      #ifdef INET6
              case AF_INET6:
                      sai6 = (const struct sockaddr_in6 *)sa;
                      error = prison_check_ip6(cred, &sai6->sin6_addr);
                      break;
      #endif
              default:
                      if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
                              error = EAFNOSUPPORT;
              }
              return (error);
      }
      
      /*
       * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
       */
      int
      prison_check(struct ucred *cred1, struct ucred *cred2)
{

	return ((cred1->cr_prison == cred2->cr_prison ||
                  prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
      }
      
      /*
       * Return 1 if p2 is a child of p1, otherwise 0.
       */
      int
      prison_ischild(struct prison *pr1, struct prison *pr2)
      {
      
              for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
                      if (pr1 == pr2)
                              return (1);
              return (0);
      }
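
/*
 * Illustrative sketch (example only): how a "may p1 act on p2" decision
 * typically uses prison_check().  The helper name is an assumption; the
 * real callers live in kern_prot.c and elsewhere.
 */
#if 0
static int
example_can_see(struct ucred *actor, struct ucred *target)
{
	/* ESRCH makes the target invisible to the actor's jail. */
	return (prison_check(actor, target));
}
#endif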
      
      /*
       * Return 1 if the passed credential is in a jail and that jail does not
       * have its own virtual network stack, otherwise 0.
       */
      int
      jailed_without_vnet(struct ucred *cred)
{

	if (!jailed(cred))
                      return (0);
      #ifdef VIMAGE
              if (prison_owns_vnet(cred))
                      return (0);
      #endif
      
              return (1);
      }
      
      /*
       * Return the correct hostname (domainname, et al) for the passed credential.
       */
      void
      getcredhostname(struct ucred *cred, char *buf, size_t size)
      {
              struct prison *pr;
      
              /*
               * A NULL credential can be used to shortcut to the physical
               * system's hostname.
               */
              pr = (cred != NULL) ? cred->cr_prison : &prison0;
              mtx_lock(&pr->pr_mtx);
              strlcpy(buf, pr->pr_hostname, size);
              mtx_unlock(&pr->pr_mtx);
      }
      
      void
      getcreddomainname(struct ucred *cred, char *buf, size_t size)
      {
      
              mtx_lock(&cred->cr_prison->pr_mtx);
              strlcpy(buf, cred->cr_prison->pr_domainname, size);
              mtx_unlock(&cred->cr_prison->pr_mtx);
      }
      
      void
      getcredhostuuid(struct ucred *cred, char *buf, size_t size)
      {
      
              mtx_lock(&cred->cr_prison->pr_mtx);
              strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
              mtx_unlock(&cred->cr_prison->pr_mtx);
      }
      
      void
      getcredhostid(struct ucred *cred, unsigned long *hostid)
      {
      
              mtx_lock(&cred->cr_prison->pr_mtx);
              *hostid = cred->cr_prison->pr_hostid;
              mtx_unlock(&cred->cr_prison->pr_mtx);
      }
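
/*
 * Illustrative sketch (example only): reading jail-aware host identity for a
 * credential.  The function name is an assumption; getcredhostname() and
 * getcredhostid() above take the prison mutex internally.
 */
#if 0
static void
example_report_host(struct ucred *cred)
{
	char host[MAXHOSTNAMELEN];
	unsigned long hostid;

	getcredhostname(cred, host, sizeof(host));
	getcredhostid(cred, &hostid);
	printf("host %s hostid %lu\n", host, hostid);
}
#endif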
      
      #ifdef VIMAGE
      /*
       * Determine whether the prison represented by cred owns
       * its vnet rather than having it inherited.
       *
       * Returns 1 in case the prison owns the vnet, 0 otherwise.
       */
      int
      prison_owns_vnet(struct ucred *cred)
      {
      
              /*
               * vnets cannot be added/removed after jail creation,
               * so no need to lock here.
               */
              return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
      }
      #endif
      
      /*
       * Determine whether the subject represented by cred can "see"
       * status of a mount point.
       * Returns: 0 for permitted, ENOENT otherwise.
       * XXX: This function should be called cr_canseemount() and should be
       *      placed in kern_prot.c.
       */
      int
      prison_canseemount(struct ucred *cred, struct mount *mp)
      {
              struct prison *pr;
              struct statfs *sp;
              size_t len;
      
              pr = cred->cr_prison;
              if (pr->pr_enforce_statfs == 0)
                      return (0);
              if (pr->pr_root->v_mount == mp)
                      return (0);
              if (pr->pr_enforce_statfs == 2)
                      return (ENOENT);
	/*
	 * If the jail's chroot directory is set to "/", we should be able to
	 * see all mount points from inside the jail.
	 * This is an ugly check, but it is the only situation in which the
	 * jail's directory ends with '/'.
	 */
              if (strcmp(pr->pr_path, "/") == 0)
                      return (0);
              len = strlen(pr->pr_path);
              sp = &mp->mnt_stat;
              if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
                      return (ENOENT);
	/*
	 * Be sure that we don't have a situation where the jail's root
	 * directory is "/some/path" and the mount point is "/some/pathpath".
	 */
              if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
                      return (ENOENT);
              return (0);
      }
      
      void
      prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
      {
              char jpath[MAXPATHLEN];
              struct prison *pr;
              size_t len;
      
              pr = cred->cr_prison;
              if (pr->pr_enforce_statfs == 0)
                      return;
              if (prison_canseemount(cred, mp) != 0) {
                      bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
                      strlcpy(sp->f_mntonname, "[restricted]",
                          sizeof(sp->f_mntonname));
                      return;
              }
              if (pr->pr_root->v_mount == mp) {
		/*
		 * Clear the current buffer data, so we are sure nothing from
		 * the valid path is left there.
		 */
                      bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
                      *sp->f_mntonname = '/';
                      return;
              }
	/*
	 * If the jail's chroot directory is set to "/", we should be able to
	 * see all mount points from inside the jail.
	 */
              if (strcmp(pr->pr_path, "/") == 0)
                      return;
              len = strlen(pr->pr_path);
              strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
	/*
	 * Clear the current buffer data, so we are sure nothing from the
	 * valid path is left there.
	 */
              bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
              if (*jpath == '\0') {
                      /* Should never happen. */
                      *sp->f_mntonname = '/';
              } else {
                      strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
              }
      }
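
/*
 * Illustrative sketch (example only): statfs-style callers copy the mount's
 * statistics and then let prison_enforce_statfs() rewrite f_mntonname for
 * the caller's jail.  For a jail rooted at /jails/a with enforce_statfs == 1,
 * a mount on /jails/a/usr/local is reported as /usr/local, the jail's root
 * mount as "/", and anything outside the jail as "[restricted]".  The
 * function name is an assumption.
 */
#if 0
static void
example_filter_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
{
	*sp = mp->mnt_stat;			/* work on a copy... */
	prison_enforce_statfs(cred, mp, sp);	/* ...then rewrite it */
}
#endif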
      
      /*
 * Check whether a specific privilege is granted within jail.  We have a
 * specific list of accepted privileges; the rest are denied.
       */
      int
      prison_priv_check(struct ucred *cred, int priv)
{
      
              /*
               * Some policies have custom handlers. This routine should not be
               * called for them. See priv_check_cred().
               */
              switch (priv) {
              case PRIV_VFS_GENERATION:
                      KASSERT(0, ("prison_priv_check instead of a custom handler "
                          "called for %d\n", priv));
              }
      
	if (!jailed(cred))
                      return (0);
      
      #ifdef VIMAGE
              /*
               * Privileges specific to prisons with a virtual network stack.
               * There might be a duplicate entry here in case the privilege
               * is only granted conditionally in the legacy jail case.
               */
              switch (priv) {
      #ifdef notyet
                      /*
                       * NFS-specific privileges.
                       */
              case PRIV_NFS_DAEMON:
              case PRIV_NFS_LOCKD:
      #endif
                      /*
                       * Network stack privileges.
                       */
              case PRIV_NET_BRIDGE:
              case PRIV_NET_GRE:
              case PRIV_NET_BPF:
              case PRIV_NET_RAW:                /* Dup, cond. in legacy jail case. */
              case PRIV_NET_ROUTE:
              case PRIV_NET_TAP:
              case PRIV_NET_SETIFMTU:
              case PRIV_NET_SETIFFLAGS:
              case PRIV_NET_SETIFCAP:
              case PRIV_NET_SETIFDESCR:
	case PRIV_NET_SETIFNAME:
              case PRIV_NET_SETIFMETRIC:
              case PRIV_NET_SETIFPHYS:
              case PRIV_NET_SETIFMAC:
              case PRIV_NET_SETLANPCP:
              case PRIV_NET_ADDMULTI:
              case PRIV_NET_DELMULTI:
              case PRIV_NET_HWIOCTL:
              case PRIV_NET_SETLLADDR:
              case PRIV_NET_ADDIFGROUP:
              case PRIV_NET_DELIFGROUP:
              case PRIV_NET_IFCREATE:
              case PRIV_NET_IFDESTROY:
              case PRIV_NET_ADDIFADDR:
              case PRIV_NET_DELIFADDR:
              case PRIV_NET_LAGG:
              case PRIV_NET_GIF:
              case PRIV_NET_SETIFVNET:
              case PRIV_NET_SETIFFIB:
      
                      /*
                       * 802.11-related privileges.
                       */
              case PRIV_NET80211_GETKEY:
      #ifdef notyet
              case PRIV_NET80211_MANAGE:                /* XXX-BZ discuss with sam@ */
      #endif
      
      #ifdef notyet
                      /*
                       * ATM privileges.
                       */
              case PRIV_NETATM_CFG:
              case PRIV_NETATM_ADD:
              case PRIV_NETATM_DEL:
              case PRIV_NETATM_SET:
      
                      /*
                       * Bluetooth privileges.
                       */
              case PRIV_NETBLUETOOTH_RAW:
      #endif
      
                      /*
                       * Netgraph and netgraph module privileges.
                       */
              case PRIV_NETGRAPH_CONTROL:
      #ifdef notyet
              case PRIV_NETGRAPH_TTY:
      #endif
      
                      /*
                       * IPv4 and IPv6 privileges.
                       */
              case PRIV_NETINET_IPFW:
              case PRIV_NETINET_DIVERT:
              case PRIV_NETINET_PF:
              case PRIV_NETINET_DUMMYNET:
              case PRIV_NETINET_CARP:
              case PRIV_NETINET_MROUTE:
              case PRIV_NETINET_RAW:
              case PRIV_NETINET_ADDRCTRL6:
              case PRIV_NETINET_ND6:
              case PRIV_NETINET_SCOPE6:
              case PRIV_NETINET_ALIFETIME6:
              case PRIV_NETINET_IPSEC:
              case PRIV_NETINET_BINDANY:
      
      #ifdef notyet
                      /*
                       * NCP privileges.
                       */
              case PRIV_NETNCP:
      
                      /*
                       * SMB privileges.
                       */
              case PRIV_NETSMB:
      #endif
      
              /*
               * No default: or deny here.
	 * If not permitted, fall through to the next switch().
               */
                      if (cred->cr_prison->pr_flags & PR_VNET)
                              return (0);
              }
      #endif /* VIMAGE */
      
              switch (priv) {
      
                      /*
                       * Allow ktrace privileges for root in jail.
                       */
              case PRIV_KTRACE:
      
      #if 0
                      /*
                       * Allow jailed processes to configure audit identity and
                       * submit audit records (login, etc).  In the future we may
                       * want to further refine the relationship between audit and
                       * jail.
                       */
              case PRIV_AUDIT_GETAUDIT:
              case PRIV_AUDIT_SETAUDIT:
              case PRIV_AUDIT_SUBMIT:
      #endif
      
                      /*
                       * Allow jailed processes to manipulate process UNIX
                       * credentials in any way they see fit.
                       */
              case PRIV_CRED_SETUID:
              case PRIV_CRED_SETEUID:
              case PRIV_CRED_SETGID:
              case PRIV_CRED_SETEGID:
              case PRIV_CRED_SETGROUPS:
              case PRIV_CRED_SETREUID:
              case PRIV_CRED_SETREGID:
              case PRIV_CRED_SETRESUID:
              case PRIV_CRED_SETRESGID:
      
                      /*
                       * Jail implements visibility constraints already, so allow
                       * jailed root to override uid/gid-based constraints.
                       */
              case PRIV_SEEOTHERGIDS:
              case PRIV_SEEOTHERUIDS:
      
                      /*
                       * Jail implements inter-process debugging limits already, so
                       * allow jailed root various debugging privileges.
                       */
              case PRIV_DEBUG_DIFFCRED:
              case PRIV_DEBUG_SUGID:
              case PRIV_DEBUG_UNPRIV:
      
                      /*
                       * Allow jail to set various resource limits and login
                       * properties, and for now, exceed process resource limits.
                       */
              case PRIV_PROC_LIMIT:
              case PRIV_PROC_SETLOGIN:
              case PRIV_PROC_SETRLIMIT:
      
                      /*
                       * System V and POSIX IPC privileges are granted in jail.
                       */
              case PRIV_IPC_READ:
              case PRIV_IPC_WRITE:
              case PRIV_IPC_ADMIN:
              case PRIV_IPC_MSGSIZE:
              case PRIV_MQ_ADMIN:
      
                      /*
                       * Jail operations within a jail work on child jails.
                       */
              case PRIV_JAIL_ATTACH:
              case PRIV_JAIL_SET:
              case PRIV_JAIL_REMOVE:
      
                      /*
                       * Jail implements its own inter-process limits, so allow
                       * root processes in jail to change scheduling on other
                       * processes in the same jail.  Likewise for signalling.
                       */
              case PRIV_SCHED_DIFFCRED:
              case PRIV_SCHED_CPUSET:
              case PRIV_SIGNAL_DIFFCRED:
              case PRIV_SIGNAL_SUGID:
      
                      /*
                       * Allow jailed processes to write to sysctls marked as jail
                       * writable.
                       */
              case PRIV_SYSCTL_WRITEJAIL:
      
                      /*
                       * Allow root in jail to manage a variety of quota
                       * properties.  These should likely be conditional on a
                       * configuration option.
                       */
              case PRIV_VFS_GETQUOTA:
              case PRIV_VFS_SETQUOTA:
      
                      /*
                       * Since Jail relies on chroot() to implement file system
                       * protections, grant many VFS privileges to root in jail.
                       * Be careful to exclude mount-related and NFS-related
                       * privileges.
                       */
              case PRIV_VFS_READ:
              case PRIV_VFS_WRITE:
              case PRIV_VFS_ADMIN:
              case PRIV_VFS_EXEC:
              case PRIV_VFS_LOOKUP:
              case PRIV_VFS_BLOCKRESERVE:        /* XXXRW: Slightly surprising. */
              case PRIV_VFS_CHFLAGS_DEV:
              case PRIV_VFS_CHOWN:
              case PRIV_VFS_CHROOT:
              case PRIV_VFS_RETAINSUGID:
              case PRIV_VFS_FCHROOT:
              case PRIV_VFS_LINK:
              case PRIV_VFS_SETGID:
              case PRIV_VFS_STAT:
              case PRIV_VFS_STICKYFILE:
      
                      /*
                       * As in the non-jail case, non-root users are expected to be
		 * able to read kernel/physical memory (provided /dev/[k]mem
                       * exists in the jail and they have permission to access it).
                       */
              case PRIV_KMEM_READ:
                      return (0);
      
                      /*
                       * Depending on the global setting, allow privilege of
                       * setting system flags.
                       */
              case PRIV_VFS_SYSFLAGS:
                      if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
                              return (0);
                      else
                              return (EPERM);
      
                      /*
                       * Depending on the global setting, allow privilege of
                       * mounting/unmounting file systems.
                       */
              case PRIV_VFS_MOUNT:
              case PRIV_VFS_UNMOUNT:
              case PRIV_VFS_MOUNT_NONUSER:
              case PRIV_VFS_MOUNT_OWNER:
                      if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
                          cred->cr_prison->pr_enforce_statfs < 2)
                              return (0);
                      else
                              return (EPERM);
      
                      /*
		 * Conditionally allow locking (unlocking) physical pages
                       * in memory.
                       */
              case PRIV_VM_MLOCK:
              case PRIV_VM_MUNLOCK:
                      if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
                              return (0);
                      else
                              return (EPERM);
      
                      /*
                       * Conditionally allow jailed root to bind reserved ports.
                       */
              case PRIV_NETINET_RESERVEDPORT:
                      if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
                              return (0);
                      else
                              return (EPERM);
      
                      /*
                       * Allow jailed root to reuse in-use ports.
                       */
              case PRIV_NETINET_REUSEPORT:
                      return (0);
      
                      /*
                       * Allow jailed root to set certain IPv4/6 (option) headers.
                       */
              case PRIV_NETINET_SETHDROPTS:
                      return (0);
      
                      /*
                       * Conditionally allow creating raw sockets in jail.
                       */
              case PRIV_NETINET_RAW:
                      if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
                              return (0);
                      else
                              return (EPERM);
      
                      /*
                       * Since jail implements its own visibility limits on netstat
                       * sysctls, allow getcred.  This allows identd to work in
                       * jail.
                       */
              case PRIV_NETINET_GETCRED:
                      return (0);
      
                      /*
                       * Allow jailed root to set loginclass.
                       */
              case PRIV_PROC_SETLOGINCLASS:
                      return (0);
      
                      /*
                       * Do not allow a process inside a jail to read the kernel
                       * message buffer unless explicitly permitted.
                       */
              case PRIV_MSGBUF:
                      if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
                              return (0);
                      return (EPERM);
      
              default:
                      /*
                       * In all remaining cases, deny the privilege request.  This
		 * includes almost all network privileges and many system
		 * configuration privileges.
                       */
                      return (EPERM);
              }
      }
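
/*
 * Illustrative sketch (example only): the shape of a caller that consults
 * the table above before granting a privilege to a jailed credential.  The
 * function name is an assumption; the real dispatch is priv_check_cred().
 */
#if 0
static int
example_require_priv(struct ucred *cred, int priv)
{
	int error;

	/* Jailed credentials are filtered through prison_priv_check(). */
	error = prison_priv_check(cred, priv);
	if (error != 0)
		return (error);

	/* ... the remaining, non-jail privilege policy would run here ... */
	return (0);
}
#endif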
      
      /*
       * Return the part of pr2's name that is relative to pr1, or the whole name
       * if it does not directly follow.
       */
      
      char *
      prison_name(struct prison *pr1, struct prison *pr2)
      {
              char *name;
      
              /* Jails see themselves as "0" (if they see themselves at all). */
              if (pr1 == pr2)
                      return "0";
              name = pr2->pr_name;
              if (prison_ischild(pr1, pr2)) {
                      /*
                       * pr1 isn't locked (and allprison_lock may not be either)
                       * so its length can't be counted on.  But the number of dots
                       * can be counted on - and counted.
                       */
                      for (; pr1 != &prison0; pr1 = pr1->pr_parent)
                              name = strchr(name, '.') + 1;
              }
              return (name);
      }
      
      /*
       * Return the part of pr2's path that is relative to pr1, or the whole path
       * if it does not directly follow.
       */
      static char *
      prison_path(struct prison *pr1, struct prison *pr2)
      {
              char *path1, *path2;
              int len1;
      
              path1 = pr1->pr_path;
              path2 = pr2->pr_path;
              if (!strcmp(path1, "/"))
                      return (path2);
              len1 = strlen(path1);
              if (strncmp(path1, path2, len1))
                      return (path2);
              if (path2[len1] == '\0')
                      return "/";
              if (path2[len1] == '/')
                      return (path2 + len1);
              return (path2);
      }
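
/*
 * Worked example (illustration only): if "foo" is a direct child of prison0
 * and looks at its descendant "foo.bar.baz", prison_name() strips one
 * leading component per ancestor level and returns "bar.baz".  Likewise,
 * prison_path() returns the descendant's path relative to the ancestor's
 * root, or "/" when the two paths coincide.  The function name below is an
 * assumption.
 */
#if 0
static void
example_relative_name(struct prison *parent, struct prison *child)
{
	printf("relative name: %s\n", prison_name(parent, child));
}
#endif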
      
      /*
       * Jail-related sysctls.
       */
      static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Jails");
      
      static int
      sysctl_jail_list(SYSCTL_HANDLER_ARGS)
      {
              struct xprison *xp;
              struct prison *pr, *cpr;
      #ifdef INET
              struct in_addr *ip4 = NULL;
              int ip4s = 0;
      #endif
      #ifdef INET6
              struct in6_addr *ip6 = NULL;
              int ip6s = 0;
      #endif
              int descend, error;
      
              xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
              pr = req->td->td_ucred->cr_prison;
              error = 0;
              sx_slock(&allprison_lock);
              FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
      #if defined(INET) || defined(INET6)
       again:
      #endif
                      mtx_lock(&cpr->pr_mtx);
      #ifdef INET
                      if (cpr->pr_ip4s > 0) {
                              if (ip4s < cpr->pr_ip4s) {
                                      ip4s = cpr->pr_ip4s;
                                      mtx_unlock(&cpr->pr_mtx);
                                      ip4 = realloc(ip4, ip4s *
                                          sizeof(struct in_addr), M_TEMP, M_WAITOK);
                                      goto again;
                              }
                              bcopy(cpr->pr_ip4, ip4,
                                  cpr->pr_ip4s * sizeof(struct in_addr));
                      }
      #endif
      #ifdef INET6
                      if (cpr->pr_ip6s > 0) {
                              if (ip6s < cpr->pr_ip6s) {
                                      ip6s = cpr->pr_ip6s;
                                      mtx_unlock(&cpr->pr_mtx);
                                      ip6 = realloc(ip6, ip6s *
                                          sizeof(struct in6_addr), M_TEMP, M_WAITOK);
                                      goto again;
                              }
                              bcopy(cpr->pr_ip6, ip6,
                                  cpr->pr_ip6s * sizeof(struct in6_addr));
                      }
      #endif
                      if (cpr->pr_ref == 0) {
                              mtx_unlock(&cpr->pr_mtx);
                              continue;
                      }
                      bzero(xp, sizeof(*xp));
                      xp->pr_version = XPRISON_VERSION;
                      xp->pr_id = cpr->pr_id;
                      xp->pr_state = cpr->pr_uref > 0
                          ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
                      strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
                      strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
                      strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
      #ifdef INET
                      xp->pr_ip4s = cpr->pr_ip4s;
      #endif
      #ifdef INET6
                      xp->pr_ip6s = cpr->pr_ip6s;
      #endif
                      mtx_unlock(&cpr->pr_mtx);
                      error = SYSCTL_OUT(req, xp, sizeof(*xp));
                      if (error)
                              break;
      #ifdef INET
                      if (xp->pr_ip4s > 0) {
                              error = SYSCTL_OUT(req, ip4,
                                  xp->pr_ip4s * sizeof(struct in_addr));
                              if (error)
                                      break;
                      }
      #endif
      #ifdef INET6
                      if (xp->pr_ip6s > 0) {
                              error = SYSCTL_OUT(req, ip6,
                                  xp->pr_ip6s * sizeof(struct in6_addr));
                              if (error)
                                      break;
                      }
      #endif
              }
              sx_sunlock(&allprison_lock);
              free(xp, M_TEMP);
      #ifdef INET
              free(ip4, M_TEMP);
      #endif
      #ifdef INET6
              free(ip6, M_TEMP);
      #endif
              return (error);
      }
      
      SYSCTL_OID(_security_jail, OID_AUTO, list,
          CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
          sysctl_jail_list, "S", "List of active jails");
      
      static int
      sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
      {
              int error, injail;
      
              injail = jailed(req->td->td_ucred);
              error = SYSCTL_OUT(req, &injail, sizeof(injail));
      
              return (error);
      }
      
      SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
          CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
          sysctl_jail_jailed, "I", "Process in jail?");
      
      static int
      sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
      {
              int error, havevnet;
      #ifdef VIMAGE
              struct ucred *cred = req->td->td_ucred;
      
              havevnet = jailed(cred) && prison_owns_vnet(cred);
      #else
              havevnet = 0;
      #endif
              error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
      
              return (error);
      }
      
      SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
          CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
          sysctl_jail_vnet, "I", "Jail owns vnet?");
      
      #if defined(INET) || defined(INET6)
      SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
          &jail_max_af_ips, 0,
          "Number of IP addresses a jail may have at most per address family (deprecated)");
      #endif
      
      /*
       * Default parameters for jail(2) compatibility.  For historical reasons,
       * the sysctl names have varying similarity to the parameter names.  Prisons
       * just see their own parameters, and can't change them.
       */
      static int
      sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
      {
              struct prison *pr;
              int allow, error, i;
      
              pr = req->td->td_ucred->cr_prison;
              allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
      
              /* Get the current flag value, and convert it to a boolean. */
              i = (allow & arg2) ? 1 : 0;
              if (arg1 != NULL)
                      i = !i;
              error = sysctl_handle_int(oidp, &i, 0, req);
              if (error || !req->newptr)
                      return (error);
              i = i ? arg2 : 0;
              if (arg1 != NULL)
                      i ^= arg2;
              /*
               * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
               * for writing.
               */
              mtx_lock(&prison0.pr_mtx);
              jail_default_allow = (jail_default_allow & ~arg2) | i;
              mtx_unlock(&prison0.pr_mtx);
              return (0);
      }
      
      SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
          CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
          NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
          "Processes in jail can set their hostnames (deprecated)");
      SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
          CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
          (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
          "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
      SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
          CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
          NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
          "Processes in jail can use System V IPC primitives (deprecated)");
      SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
          CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
          NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
          "Prison root can create raw sockets (deprecated)");
      SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
          CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
          NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
          "Processes in jail can alter system file flags (deprecated)");
      SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
          CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
          NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
          "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
      
      static int
      sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
      {
              struct prison *pr;
              int level, error;
      
              pr = req->td->td_ucred->cr_prison;
              level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
              error = sysctl_handle_int(oidp, &level, 0, req);
              if (error || !req->newptr)
                      return (error);
              *(int *)arg1 = level;
              return (0);
      }
      
      SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
          CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
          &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
          sysctl_jail_default_level, "I",
          "Processes in jail cannot see all mounted file systems (deprecated)");
      
      SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
          CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
          &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
          sysctl_jail_default_level, "I",
          "Ruleset for the devfs filesystem in jail (deprecated)");
      
      /*
       * Nodes to describe jail parameters.  Maximum length of string parameters
       * is returned in the string itself, and the other parameters exist merely
       * to make themselves and their types known.
       */
      SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
          "Jail parameters");
      
      int
      sysctl_jail_param(SYSCTL_HANDLER_ARGS)
      {
              int i;
              long l;
              size_t s;
              char numbuf[12];
      
	switch (oidp->oid_kind & CTLTYPE) {
              case CTLTYPE_LONG:
              case CTLTYPE_ULONG:
                      l = 0;
      #ifdef SCTL_MASK32
                      if (!(req->flags & SCTL_MASK32))
      #endif
                              return (SYSCTL_OUT(req, &l, sizeof(l)));
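		/* FALLTHROUGH: with SCTL_MASK32 set, report a 32-bit zero. */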
              case CTLTYPE_INT:
              case CTLTYPE_UINT:
                      i = 0;
                      return (SYSCTL_OUT(req, &i, sizeof(i)));
              case CTLTYPE_STRING:
                      snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
                      return
                          (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
              case CTLTYPE_STRUCT:
                      s = (size_t)arg2;
                      return (SYSCTL_OUT(req, &s, sizeof(s)));
              }
              return (0);
      }
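
/*
 * Illustrative sketch (example only): how a jail(8)-like tool can size a
 * string parameter from userland.  For CTLTYPE_STRING nodes the handler
 * above returns the maximum string length as a decimal number in the
 * string itself.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	char buf[32];
	size_t len = sizeof(buf);

	if (sysctlbyname("security.jail.param.name", buf, &len, NULL, 0) == 0)
		printf("name parameter max length: %ld\n",
		    strtol(buf, NULL, 10));
	return (0);
}
#endif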
      
      /*
       * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
       * jail creation time but cannot be changed in an existing jail.
       */
      SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
      SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
      SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
      SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
      SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
          "I", "Jail secure level");
      SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", 
          "Jail value for kern.osreldate and uname -K");
      SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, 
          "Jail value for kern.osrelease and uname -r");
      SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
          "I", "Jail cannot see all mounted file systems");
      SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
          "I", "Ruleset for in-jail devfs mounts");
      SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Jail persistence");
      #ifdef VIMAGE
      SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
          "E,jailsys", "Virtual network stack");
      #endif
      SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
          "B", "Jail is in the process of shutting down");
      
      SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
      SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
          "I", "Current number of child jails");
      SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
          "I", "Maximum number of child jails");
      
      SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
      SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
          "Jail hostname");
      SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
          "Jail NIS domainname");
      SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
          "Jail host UUID");
      SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
          "LU", "Jail host ID");
      
      SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
      SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
      
      #ifdef INET
      SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
          "Jail IPv4 address virtualization");
      SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
          "S,in_addr,a", "Jail IPv4 addresses");
      SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Do (not) use IPv4 source address selection rather than the "
          "primary jail IPv4 address.");
      #endif
      #ifdef INET6
      SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
          "Jail IPv6 address virtualization");
      SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
          "S,in6_addr,a", "Jail IPv6 addresses");
      SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Do (not) use IPv6 source address selection rather than the "
          "primary jail IPv6 address.");
      #endif
      
      SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
      SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Jail may set hostname");
      SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Jail may use SYSV IPC");
      SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Jail may create raw sockets");
      SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Jail may alter system file flags");
      SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Jail may set file quotas");
      SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
      SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Jail may lock (unlock) physical pages in memory");
      SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Jail may bind sockets to reserved ports");
      SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Jail may read the kernel message buffer");
      SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
          "B", "Unprivileged processes may use process debugging facilities");
      
      SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
      SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
          "B", "Jail may mount/unmount jail-friendly file systems in general");
      
      /*
       * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>.  Return
       * its associated bit in the pr_allow bitmask, or zero if the parameter was
       * not created.
       */
      unsigned
      prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
          const char *descr)
      {
              struct bool_flags *bf;
              struct sysctl_oid *parent;
              char *allow_name, *allow_noname, *allowed;
      #ifndef NO_SYSCTL_DESCR
              char *descr_deprecated;
      #endif
              unsigned allow_flag;
      
              if (prefix
                  ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
                      < 0 ||
                    asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
                      < 0
                  : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
                    asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
                      free(allow_name, M_PRISON);
                      return 0;
              }
      
              /*
	 * See if this parameter has already been added, i.e. a module was
               * previously loaded/unloaded.
               */
              mtx_lock(&prison0.pr_mtx);
              for (bf = pr_flag_allow;
                   bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
                   bf++) {
                      if (strcmp(bf->name, allow_name) == 0) {
                              allow_flag = bf->flag;
                              goto no_add;
                      }
              }
      
              /*
               * Find a free bit in prison0's pr_allow, failing if there are none
               * (which shouldn't happen as long as we keep track of how many
               * potential dynamic flags exist).
               *
               * Due to per-jail unprivileged process debugging support
               * using pr_allow, also verify against PR_ALLOW_ALL_STATIC.
               * prison0 may have unprivileged process debugging unset.
               */
              for (allow_flag = 1;; allow_flag <<= 1) {
                      if (allow_flag == 0)
                              goto no_add;
                      if (allow_flag & PR_ALLOW_ALL_STATIC)
                              continue;
                      if ((prison0.pr_allow & allow_flag) == 0)
                              break;
              }
      
              /*
               * Note the parameter in the next open slot in pr_flag_allow.
               * Set the flag last so code that checks pr_flag_allow can do so
               * without locking.
               */
              for (bf = pr_flag_allow; bf->flag != 0; bf++)
                      if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
                              /* This should never happen, but is not fatal. */
                              allow_flag = 0;
                              goto no_add;
                      }
              prison0.pr_allow |= allow_flag;
              bf->name = allow_name;
              bf->noname = allow_noname;
              bf->flag = allow_flag;
              mtx_unlock(&prison0.pr_mtx);
      
              /*
	 * Create sysctls for the parameter, and the back-compat global
               * permission.
               */
              parent = prefix
                  ? SYSCTL_ADD_NODE(NULL,
                        SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
                        OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
                  : &sysctl___security_jail_param_allow;
              (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
                  name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
                  NULL, 0, sysctl_jail_param, "B", descr);
              if ((prefix
                   ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
                   : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
      #ifndef NO_SYSCTL_DESCR
                      (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
                          descr);
      #endif
                      (void)SYSCTL_ADD_PROC(NULL,
                          SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
                          CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
                          sysctl_jail_default_allow, "I", descr_deprecated);
      #ifndef NO_SYSCTL_DESCR
                      free(descr_deprecated, M_TEMP);
      #endif
                      free(allowed, M_TEMP);
              }
              return allow_flag;
      
       no_add:
              mtx_unlock(&prison0.pr_mtx);
              free(allow_name, M_PRISON);
              free(allow_noname, M_PRISON);
              return allow_flag;
      }
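
/*
 * Illustrative sketch (example only): how a loadable subsystem might
 * register and test a dynamic permission bit.  The "example" parameter
 * name, the variable, and both function names are assumptions.
 */
#if 0
static unsigned example_pr_allow_flag;

static void
example_register_allow(void)
{
	/* Creates allow.example (and allow.noexample) if a bit is free. */
	example_pr_allow_flag = prison_add_allow(NULL, "example", NULL,
	    "Jail may use the example facility");
}

static int
example_allowed(struct ucred *cred)
{
	/* A zero flag means the parameter could not be created. */
	return (example_pr_allow_flag != 0 &&
	    (cred->cr_prison->pr_allow & example_pr_allow_flag) != 0);
}
#endif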
      
      /*
       * The VFS system will register jail-aware filesystems here.  They each get
       * a parameter allow.mount.xxxfs and a flag to check when a jailed user
       * attempts to mount.
       */
      void
      prison_add_vfs(struct vfsconf *vfsp)
      {
      #ifdef NO_SYSCTL_DESCR
      
              vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
                  NULL, NULL);
      #else
              char *descr;
      
              (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
                  vfsp->vfc_name);
              vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
                  NULL, descr);
              free(descr, M_TEMP);
      #endif
      }
      
      #ifdef RACCT
      void
      prison_racct_foreach(void (*callback)(struct racct *racct,
          void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
          void *arg2, void *arg3)
      {
              struct prison_racct *prr;
      
              ASSERT_RACCT_ENABLED();
      
              sx_slock(&allprison_lock);
              if (pre != NULL)
                      (pre)();
              LIST_FOREACH(prr, &allprison_racct, prr_next)
                      (callback)(prr->prr_racct, arg2, arg3);
              if (post != NULL)
                      (post)();
              sx_sunlock(&allprison_lock);
      }
      
      static struct prison_racct *
      prison_racct_find_locked(const char *name)
      {
              struct prison_racct *prr;
      
              ASSERT_RACCT_ENABLED();
              sx_assert(&allprison_lock, SA_XLOCKED);
      
              if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
                      return (NULL);
      
              LIST_FOREACH(prr, &allprison_racct, prr_next) {
                      if (strcmp(name, prr->prr_name) != 0)
                              continue;
      
                      /* Found prison_racct with a matching name? */
                      prison_racct_hold(prr);
                      return (prr);
              }
      
              /* Add new prison_racct. */
              prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
              racct_create(&prr->prr_racct);
      
              strcpy(prr->prr_name, name);
              refcount_init(&prr->prr_refcount, 1);
              LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
      
              return (prr);
      }
      
      struct prison_racct *
      prison_racct_find(const char *name)
      {
              struct prison_racct *prr;
      
              ASSERT_RACCT_ENABLED();
      
              sx_xlock(&allprison_lock);
              prr = prison_racct_find_locked(name);
              sx_xunlock(&allprison_lock);
              return (prr);
      }
      
      void
      prison_racct_hold(struct prison_racct *prr)
      {
      
              ASSERT_RACCT_ENABLED();
      
              refcount_acquire(&prr->prr_refcount);
      }
      
      static void
      prison_racct_free_locked(struct prison_racct *prr)
      {
      
              ASSERT_RACCT_ENABLED();
              sx_assert(&allprison_lock, SA_XLOCKED);
      
              if (refcount_release(&prr->prr_refcount)) {
                      racct_destroy(&prr->prr_racct);
                      LIST_REMOVE(prr, prr_next);
                      free(prr, M_PRISON_RACCT);
              }
      }
      
      void
      prison_racct_free(struct prison_racct *prr)
      {
      
              ASSERT_RACCT_ENABLED();
              sx_assert(&allprison_lock, SA_UNLOCKED);
      
              if (refcount_release_if_not_last(&prr->prr_refcount))
                      return;
      
              sx_xlock(&allprison_lock);
              prison_racct_free_locked(prr);
              sx_xunlock(&allprison_lock);
      }
      
      static void
      prison_racct_attach(struct prison *pr)
      {
              struct prison_racct *prr;
      
              ASSERT_RACCT_ENABLED();
              sx_assert(&allprison_lock, SA_XLOCKED);
      
              prr = prison_racct_find_locked(pr->pr_name);
              KASSERT(prr != NULL, ("cannot find prison_racct"));
      
              pr->pr_prison_racct = prr;
      }
      
      /*
       * Handle jail renaming.  From the racct point of view, renaming means
       * moving from one prison_racct to another.
       */
      static void
      prison_racct_modify(struct prison *pr)
      {
      #ifdef RCTL
              struct proc *p;
              struct ucred *cred;
      #endif
              struct prison_racct *oldprr;
      
              ASSERT_RACCT_ENABLED();
      
              sx_slock(&allproc_lock);
              sx_xlock(&allprison_lock);
      
              if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
                      sx_xunlock(&allprison_lock);
                      sx_sunlock(&allproc_lock);
                      return;
              }
      
              oldprr = pr->pr_prison_racct;
              pr->pr_prison_racct = NULL;
      
              prison_racct_attach(pr);
      
              /*
               * Move resource utilisation records.
               */
              racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
      
      #ifdef RCTL
              /*
               * Force rctl to reattach rules to processes.
               */
              FOREACH_PROC_IN_SYSTEM(p) {
                      PROC_LOCK(p);
                      cred = crhold(p->p_ucred);
                      PROC_UNLOCK(p);
                      rctl_proc_ucred_changed(p, cred);
                      crfree(cred);
              }
      #endif
      
              sx_sunlock(&allproc_lock);
              prison_racct_free_locked(oldprr);
              sx_xunlock(&allprison_lock);
      }
      
      static void
      prison_racct_detach(struct prison *pr)
      {
      
              ASSERT_RACCT_ENABLED();
              sx_assert(&allprison_lock, SA_UNLOCKED);
      
              if (pr->pr_prison_racct == NULL)
                      return;
              prison_racct_free(pr->pr_prison_racct);
              pr->pr_prison_racct = NULL;
      }
      #endif /* RACCT */
      
      #ifdef DDB
      
      static void
      db_show_prison(struct prison *pr)
      {
              struct bool_flags *bf;
              struct jailsys_flags *jsf;
      #if defined(INET) || defined(INET6)
              int ii;
      #endif
              unsigned f;
      #ifdef INET
              char ip4buf[INET_ADDRSTRLEN];
      #endif
      #ifdef INET6
              char ip6buf[INET6_ADDRSTRLEN];
      #endif
      
              db_printf("prison %p:\n", pr);
              db_printf(" jid             = %d\n", pr->pr_id);
              db_printf(" name            = %s\n", pr->pr_name);
              db_printf(" parent          = %p\n", pr->pr_parent);
              db_printf(" ref             = %d\n", pr->pr_ref);
              db_printf(" uref            = %d\n", pr->pr_uref);
              db_printf(" path            = %s\n", pr->pr_path);
              db_printf(" cpuset          = %d\n", pr->pr_cpuset
                  ? pr->pr_cpuset->cs_id : -1);
      #ifdef VIMAGE
              db_printf(" vnet            = %p\n", pr->pr_vnet);
      #endif
              db_printf(" root            = %p\n", pr->pr_root);
              db_printf(" securelevel     = %d\n", pr->pr_securelevel);
              db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
              db_printf(" children.max    = %d\n", pr->pr_childmax);
              db_printf(" children.cur    = %d\n", pr->pr_childcount);
              db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
              db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
              db_printf(" flags           = 0x%x", pr->pr_flags);
              for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
                      if (pr->pr_flags & bf->flag)
                              db_printf(" %s", bf->name);
              for (jsf = pr_flag_jailsys;
                   jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
                   jsf++) {
                      f = pr->pr_flags & (jsf->disable | jsf->new);
                      db_printf(" %-16s= %s\n", jsf->name,
                          (f != 0 && f == jsf->disable) ? "disable"
                          : (f == jsf->new) ? "new"
                          : "inherit");
              }
              db_printf(" allow           = 0x%x", pr->pr_allow);
              for (bf = pr_flag_allow;
                   bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
                   bf++)
                      if (pr->pr_allow & bf->flag)
                              db_printf(" %s", bf->name);
              db_printf("\n");
              db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
              db_printf(" host.hostname   = %s\n", pr->pr_hostname);
              db_printf(" host.domainname = %s\n", pr->pr_domainname);
              db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
              db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
      #ifdef INET
              db_printf(" ip4s            = %d\n", pr->pr_ip4s);
              for (ii = 0; ii < pr->pr_ip4s; ii++)
                      db_printf(" %s %s\n",
                          ii == 0 ? "ip4.addr        =" : "                 ",
                          inet_ntoa_r(pr->pr_ip4[ii], ip4buf));
      #endif
      #ifdef INET6
              db_printf(" ip6s            = %d\n", pr->pr_ip6s);
              for (ii = 0; ii < pr->pr_ip6s; ii++)
                      db_printf(" %s %s\n",
                          ii == 0 ? "ip6.addr        =" : "                 ",
                          ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
      #endif
      }
      
      DB_SHOW_COMMAND(prison, db_show_prison_command)
      {
              struct prison *pr;
      
              if (!have_addr) {
                      /*
                       * Show all prisons in the list, and prison0 which is not
                       * listed.
                       */
                      db_show_prison(&prison0);
                      if (!db_pager_quit) {
                              TAILQ_FOREACH(pr, &allprison, pr_list) {
                                      db_show_prison(pr);
                                      if (db_pager_quit)
                                              break;
                              }
                      }
                      return;
              }
      
              if (addr == 0)
                      pr = &prison0;
              else {
                      /* Look for a prison with the ID and with references. */
                      TAILQ_FOREACH(pr, &allprison, pr_list)
                              if (pr->pr_id == addr && pr->pr_ref > 0)
                                      break;
                      if (pr == NULL)
                              /* Look again, without requiring a reference. */
                              TAILQ_FOREACH(pr, &allprison, pr_list)
                                      if (pr->pr_id == addr)
                                              break;
                      if (pr == NULL)
                              /* Assume address points to a valid prison. */
                              pr = (struct prison *)addr;
              }
              db_show_prison(pr);
      }
      
      #endif /* DDB */
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2013 EMC Corp.
       * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
       * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       */
      
      /*
       * Path-compressed radix trie implementation.
       * The following code is not generalized into a general purpose library
       * because there are way too many parameters embedded that should really
 * be decided by the library consumers.  At the same time, consumers
 * of this code must achieve the highest possible performance.
       *
       * The implementation takes into account the following rationale:
       * - Size of the nodes should be as small as possible but still big enough
       *   to avoid a large maximum depth for the trie.  This is a balance
       *   between the necessity to not wire too much physical memory for the nodes
       *   and the necessity to avoid too much cache pollution during the trie
       *   operations.
       * - There is not a huge bias toward the number of lookup operations over
       *   the number of insert and remove operations.  This basically implies
       *   that optimizations supposedly helping one operation but hurting the
 *   other must be carefully evaluated.
       * - On average not many nodes are expected to be fully populated, hence
       *   level compression may just complicate things.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include "opt_ddb.h"
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/kernel.h>
      #include <sys/proc.h>
      #include <sys/vmmeter.h>
      #include <sys/smr.h>
      #include <sys/smr_types.h>
      
      #include <vm/uma.h>
      #include <vm/vm.h>
      #include <vm/vm_param.h>
      #include <vm/vm_object.h>
      #include <vm/vm_page.h>
      #include <vm/vm_radix.h>
      
      #ifdef DDB
      #include <ddb/ddb.h>
      #endif
      
      /*
       * These widths should allow the pointers to a node's children to fit within
       * a single cache line.  The extra levels from a narrow width should not be
       * a problem thanks to path compression.
       */
      #ifdef __LP64__
      #define        VM_RADIX_WIDTH        4
      #else
      #define        VM_RADIX_WIDTH        3
      #endif
      
      #define        VM_RADIX_COUNT        (1 << VM_RADIX_WIDTH)
      #define        VM_RADIX_MASK        (VM_RADIX_COUNT - 1)
      #define        VM_RADIX_LIMIT                                                        \
              (howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)
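
/*
 * Worked example (illustrative only, assuming an LP64 machine with a 64-bit
 * vm_pindex_t): VM_RADIX_WIDTH is 4, so each node has VM_RADIX_COUNT == 16
 * children, VM_RADIX_MASK is 0xf, and VM_RADIX_LIMIT is
 * howmany(64, 4) - 1 == 15, i.e. levels are numbered 0 through 15.
 */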
      
      /* Flag bits stored in node pointers. */
      #define        VM_RADIX_ISLEAF        0x1
      #define        VM_RADIX_FLAGS        0x1
      #define        VM_RADIX_PAD        VM_RADIX_FLAGS
      
/* Returns one unit associated with the specified level. */
      #define        VM_RADIX_UNITLEVEL(lev)                                                \
              ((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))
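
/*
 * For instance (illustrative only, assuming VM_RADIX_WIDTH == 4),
 * VM_RADIX_UNITLEVEL(0) is 0x1, VM_RADIX_UNITLEVEL(1) is 0x10 and
 * VM_RADIX_UNITLEVEL(2) is 0x100: the index stride between consecutive
 * child slots of a node at that level.
 */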
      
      enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };
      
      struct vm_radix_node;
      typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;
      
      struct vm_radix_node {
              vm_pindex_t        rn_owner;                        /* Owner of record. */
              uint16_t        rn_count;                        /* Valid children. */
              uint8_t                rn_clev;                        /* Current level. */
              int8_t                rn_last;                        /* zero last ptr. */
              smrnode_t        rn_child[VM_RADIX_COUNT];        /* Child nodes. */
      };
      
      static uma_zone_t vm_radix_node_zone;
      static smr_t vm_radix_smr;
      
      static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
          enum vm_radix_access access);
      
      /*
       * Allocate a radix node.
       */
      static struct vm_radix_node *
      vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
      {
              struct vm_radix_node *rnode;
      
              rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
              if (rnode == NULL)
                      return (NULL);
      
              /*
               * We want to clear the last child pointer after the final section
               * has exited so lookup can not return false negatives.  It is done
               * here because it will be cache-cold in the dtor callback.
               */
        if (rnode->rn_last != 0) {
                      vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1],
                          NULL, UNSERIALIZED);
                      rnode->rn_last = 0;
              }
              rnode->rn_owner = owner;
              rnode->rn_count = count;
              rnode->rn_clev = clevel;
              return (rnode);
      }
      
      /*
       * Free radix node.
       */
      static __inline void
      vm_radix_node_put(struct vm_radix_node *rnode, int8_t last)
{
      #ifdef INVARIANTS
              int slot;
      
              KASSERT(rnode->rn_count == 0,
                  ("vm_radix_node_put: rnode %p has %d children", rnode,
                  rnode->rn_count));
              for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
                if (slot == last)
                        continue;
                KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
                          NULL, ("vm_radix_node_put: rnode %p has a child", rnode));
              }
      #endif
              /* Off by one so a freshly zero'd node is not assigned to. */
              rnode->rn_last = last + 1;
              uma_zfree_smr(vm_radix_node_zone, rnode);
      }
      
      /*
       * Return the position in the array for a given level.
       */
      static __inline int
      vm_radix_slot(vm_pindex_t index, uint16_t level)
      {
      
              return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
      }
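
/*
 * Example (illustrative only, assuming VM_RADIX_WIDTH == 4): for index 0x1234,
 * level 0 yields slot 0x4, level 1 yields slot 0x3 and level 2 yields
 * slot 0x2.
 */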
      
      /* Trims the key after the specified level. */
      static __inline vm_pindex_t
      vm_radix_trimkey(vm_pindex_t index, uint16_t level)
      {
              vm_pindex_t ret;
      
              ret = index;
              if (level > 0) {
                      ret >>= level * VM_RADIX_WIDTH;
                      ret <<= level * VM_RADIX_WIDTH;
              }
              return (ret);
      }
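
/*
 * Example (illustrative only, assuming VM_RADIX_WIDTH == 4):
 * vm_radix_trimkey(0x1234, 2) clears the two least significant 4-bit digits
 * and returns 0x1200, while a level of 0 returns the key unchanged.
 */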
      
      /*
       * Fetch a node pointer from a slot in another node.
       */
      static __inline struct vm_radix_node *
      vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
      {
      
              switch (access) {
              case UNSERIALIZED:
                      return (smr_unserialized_load(p, true));
              case LOCKED:
                      return (smr_serialized_load(p, true));
              case SMR:
                      return (smr_entered_load(p, vm_radix_smr));
              }
              __unreachable();
      }
      
      static __inline void
      vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
          enum vm_radix_access access)
{

              switch (access) {
              case UNSERIALIZED:
                      smr_unserialized_store(p, v, true);
                      break;
              case LOCKED:
                      smr_serialized_store(p, v, true);
                      break;
              case SMR:
                      panic("vm_radix_node_store: Not supported in smr section.");
              }
      }
      
      /*
       * Get the root node for a radix tree.
       */
      static __inline struct vm_radix_node *
      vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
      {
      
              return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
      }
      
      /*
       * Set the root node for a radix tree.
       */
      static __inline void
      vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
          enum vm_radix_access access)
      {
      
              vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
      }
      
      /*
       * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
       */
      static __inline boolean_t
      vm_radix_isleaf(struct vm_radix_node *rnode)
      {
      
              return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
      }
      
      /*
       * Returns the associated page extracted from rnode.
       */
      static __inline vm_page_t
      vm_radix_topage(struct vm_radix_node *rnode)
      {
      
        return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
      }
      
      /*
       * Adds the page as a child of the provided node.
       */
      static __inline void
      vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
          vm_page_t page, enum vm_radix_access access)
      {
              int slot;
      
              slot = vm_radix_slot(index, clev);
              vm_radix_node_store(&rnode->rn_child[slot],
                  (struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF), access);
      }
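
/*
 * Illustrative note: a leaf is the vm_page pointer itself with
 * VM_RADIX_ISLEAF set in its low bit, which is always clear for pointers to
 * the (aligned) vm_page structures.  vm_radix_isleaf() tests that bit and
 * vm_radix_topage() masks it off again to recover the original pointer.
 */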
      
      /*
 * Returns the level at which two keys differ.
 * It cannot accept two equal keys.
       */
      static __inline uint16_t
      vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
      {
              uint16_t clev;
      
              KASSERT(index1 != index2, ("%s: passing the same key value %jx",
                  __func__, (uintmax_t)index1));
      
              index1 ^= index2;
              for (clev = VM_RADIX_LIMIT;; clev--)
                if (vm_radix_slot(index1, clev) != 0)
                              return (clev);
      }
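
/*
 * Example (illustrative only, assuming VM_RADIX_WIDTH == 4): for keys 0x1234
 * and 0x1264 the XOR is 0x0050, whose most significant nonzero 4-bit digit
 * sits at level 1, so the function returns 1.
 */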
      
      /*
       * Returns TRUE if it can be determined that key does not belong to the
       * specified rnode.  Otherwise, returns FALSE.
       */
      static __inline boolean_t
      vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
      {
      
              if (rnode->rn_clev < VM_RADIX_LIMIT) {
                idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
                      return (idx != rnode->rn_owner);
              }
              return (FALSE);
      }
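
/*
 * Example (illustrative only, assuming VM_RADIX_WIDTH == 4): a node with
 * rn_owner 0x1200 and rn_clev 1 covers keys 0x1200-0x12ff;
 * vm_radix_keybarr() returns FALSE for index 0x1234 but TRUE for
 * index 0x4321.
 */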
      
      /*
       * Internal helper for vm_radix_reclaim_allnodes().
       * This function is recursive.
       */
      static void
      vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
{
              struct vm_radix_node *child;
              int slot;
      
              KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
                  ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
        for (slot = 0; rnode->rn_count != 0; slot++) {
                child = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED);
                if (child == NULL)
                        continue;
                if (!vm_radix_isleaf(child))
                        vm_radix_reclaim_allnodes_int(child);
                      vm_radix_node_store(&rnode->rn_child[slot], NULL, UNSERIALIZED);
                      rnode->rn_count--;
              }
              vm_radix_node_put(rnode, -1);
      }
      
      #ifndef UMA_MD_SMALL_ALLOC
      void vm_radix_reserve_kva(void);
      /*
       * Reserve the KVA necessary to satisfy the node allocation.
 * This is mandatory on architectures that do not support a direct
 * mapping, as they would otherwise have to carve into the kernel maps for
 * every node allocation, resulting in deadlocks for consumers already
 * working with kernel maps.
       */
      void
      vm_radix_reserve_kva(void)
      {
      
              /*
               * Calculate the number of reserved nodes, discounting the pages that
               * are needed to store them.
               */
              if (!uma_zone_reserve_kva(vm_radix_node_zone,
                  ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
                  sizeof(struct vm_radix_node))))
                      panic("%s: unable to reserve KVA", __func__);
      }
      #endif
      
      /*
       * Initialize the UMA slab zone.
       */
      void
      vm_radix_zinit(void)
      {
      
              vm_radix_node_zone = uma_zcreate("RADIX NODE",
                  sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL,
                  VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT);
              vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
      }
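
/*
 * Note: because the zone is created with UMA_ZONE_SMR, nodes released with
 * uma_zfree_smr() are not reused until all readers that entered the SMR
 * section before the free have exited it.  This is what allows
 * vm_radix_lookup_unlocked() below to walk the trie without holding the
 * writer's lock.
 */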
      
      /*
       * Inserts the key-value pair into the trie.
       * Panics if the key already exists.
       */
      int
      vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
{
              vm_pindex_t index, newind;
              struct vm_radix_node *rnode, *tmp;
              smrnode_t *parentp;
              vm_page_t m;
              int slot;
              uint16_t clev;
      
              index = page->pindex;
      
              /*
               * The owner of record for root is not really important because it
               * will never be used.
               */
              rnode = vm_radix_root_load(rtree, LOCKED);
              if (rnode == NULL) {
                rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
                      return (0);
              }
              parentp = (smrnode_t *)&rtree->rt_root;
              for (;;) {
                      if (vm_radix_isleaf(rnode)) {
                              m = vm_radix_topage(rnode);
                              if (m->pindex == index)
                                      panic("%s: key %jx is already present",
                                          __func__, (uintmax_t)index);
                        clev = vm_radix_keydiff(m->pindex, index);
                              tmp = vm_radix_node_get(vm_radix_trimkey(index,
                                  clev + 1), 2, clev);
                              if (tmp == NULL)
                                      return (ENOMEM);
                              /* These writes are not yet visible due to ordering. */
                              vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
                              vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED);
                              /* Synchronize to make leaf visible. */
                              vm_radix_node_store(parentp, tmp, LOCKED);
                              return (0);
                } else if (vm_radix_keybarr(rnode, index))
                              break;
                      slot = vm_radix_slot(index, rnode->rn_clev);
                      parentp = &rnode->rn_child[slot];
                      tmp = vm_radix_node_load(parentp, LOCKED);
                if (tmp == NULL) {
                              rnode->rn_count++;
                              vm_radix_addpage(rnode, index, rnode->rn_clev, page,
                                  LOCKED);
                              return (0);
                      }
                      rnode = tmp;
              }
      
        /*
         * A new node is needed because the correct insertion level has been
         * reached.  Set up the new intermediate node and add the two
         * children: the new object and the older edge.
         */
              newind = rnode->rn_owner;
              clev = vm_radix_keydiff(newind, index);
              tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
              if (tmp == NULL)
                      return (ENOMEM);
              slot = vm_radix_slot(newind, clev);
              /* These writes are not yet visible due to ordering. */
              vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
              vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED);
              /* Serializing write to make the above visible. */
              vm_radix_node_store(parentp, tmp, LOCKED);
      
              return (0);
      }
      
      /*
       * Returns TRUE if the specified radix tree contains a single leaf and FALSE
       * otherwise.
       */
      boolean_t
      vm_radix_is_singleton(struct vm_radix *rtree)
      {
              struct vm_radix_node *rnode;
      
              rnode = vm_radix_root_load(rtree, LOCKED);
              if (rnode == NULL)
                      return (FALSE);
              return (vm_radix_isleaf(rnode));
      }
      
      /*
       * Returns the value stored at the index.  If the index is not present,
       * NULL is returned.
       */
      static __always_inline vm_page_t
      _vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
          enum vm_radix_access access)
      {
              struct vm_radix_node *rnode;
              vm_page_t m;
              int slot;
      
              rnode = vm_radix_root_load(rtree, access);
        while (rnode != NULL) {
                if (vm_radix_isleaf(rnode)) {
                        m = vm_radix_topage(rnode);
                        if (m->pindex == index)
                                      return (m);
                              break;
                      }
                if (vm_radix_keybarr(rnode, index))
                              break;
                      slot = vm_radix_slot(index, rnode->rn_clev);
                      rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
              }
              return (NULL);
      }
      
      /*
       * Returns the value stored at the index assuming there is an external lock.
       *
       * If the index is not present, NULL is returned.
       */
      vm_page_t
      vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
{

        return _vm_radix_lookup(rtree, index, LOCKED);
      }
      
      /*
       * Returns the value stored at the index without requiring an external lock.
       *
       * If the index is not present, NULL is returned.
       */
      vm_page_t
      vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
{
              vm_page_t m;
      
              smr_enter(vm_radix_smr);
              m = _vm_radix_lookup(rtree, index, SMR);
              smr_exit(vm_radix_smr);
      
              return (m);
      }
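
#if 0
/*
 * Illustrative sketch only (never compiled): a hypothetical consumer that
 * inserts a page and immediately looks it up again.  It assumes the caller
 * already holds whatever lock serializes writers, as the LOCKED access mode
 * requires; vm_radix_example() is not part of the real interface.
 */
static vm_page_t
vm_radix_example(struct vm_radix *rtree, vm_page_t m)
{

        if (vm_radix_insert(rtree, m) != 0)
                return (NULL);          /* Node allocation failed (ENOMEM). */
        return (vm_radix_lookup(rtree, m->pindex));
}
#endif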
      
      /*
       * Look up the nearest entry at a position greater than or equal to index.
       */
      vm_page_t
      vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
{
              struct vm_radix_node *stack[VM_RADIX_LIMIT];
              vm_pindex_t inc;
              vm_page_t m;
              struct vm_radix_node *child, *rnode;
      #ifdef INVARIANTS
              int loops = 0;
      #endif
              int slot, tos;
      
              rnode = vm_radix_root_load(rtree, LOCKED);
              if (rnode == NULL)
                      return (NULL);
              else if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                      if (m->pindex >= index)
                              return (m);
                      else
                              return (NULL);
              }
              tos = 0;
              for (;;) {
                /*
                 * If the keys differ before the current bisection node,
                 * then the search key might roll back to the earliest
                 * available bisection node or to the smallest key
                 * in the current node (if the owner is greater than the
                 * search key).
                 */
                if (vm_radix_keybarr(rnode, index)) {
                        if (index > rnode->rn_owner) {
      ascend:
                                      KASSERT(++loops < 1000,
                                          ("vm_radix_lookup_ge: too many loops"));
      
                                      /*
                                       * Pop nodes from the stack until either the
                                       * stack is empty or a node that could have a
                                       * matching descendant is found.
                                       */
                                      do {
                                        if (tos == 0)
                                                      return (NULL);
                                              rnode = stack[--tos];
                                      } while (vm_radix_slot(index,
                                          rnode->rn_clev) == (VM_RADIX_COUNT - 1));
      
                                      /*
                                       * The following computation cannot overflow
                                       * because index's slot at the current level
                                       * is less than VM_RADIX_COUNT - 1.
                                       */
                                      index = vm_radix_trimkey(index,
                                          rnode->rn_clev);
                                      index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
                              } else
                                      index = rnode->rn_owner;
                        KASSERT(!vm_radix_keybarr(rnode, index),
                                  ("vm_radix_lookup_ge: keybarr failed"));
                      }
                      slot = vm_radix_slot(index, rnode->rn_clev);
                      child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                      if (vm_radix_isleaf(child)) {
                              m = vm_radix_topage(child);
                        if (m->pindex >= index)
                                return (m);
                } else if (child != NULL)
                              goto descend;
      
                      /*
                       * Look for an available edge or page within the current
                       * bisection node.
                       */
                if (slot < (VM_RADIX_COUNT - 1)) {
                              inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
                              index = vm_radix_trimkey(index, rnode->rn_clev);
                              do {
                                      index += inc;
                                      slot++;
                                      child = vm_radix_node_load(&rnode->rn_child[slot],
                                          LOCKED);
                                      if (vm_radix_isleaf(child)) {
                                              m = vm_radix_topage(child);
                                              if (m->pindex >= index)
                                                      return (m);
                                } else if (child != NULL)
                                        goto descend;
                        } while (slot < (VM_RADIX_COUNT - 1));
                }
                KASSERT(child == NULL || vm_radix_isleaf(child),
                          ("vm_radix_lookup_ge: child is radix node"));
      
                      /*
                       * If a page or edge greater than the search slot is not found
                       * in the current node, ascend to the next higher-level node.
                       */
                      goto ascend;
      descend:
                      KASSERT(rnode->rn_clev > 0,
                          ("vm_radix_lookup_ge: pushing leaf's parent"));
                KASSERT(tos < VM_RADIX_LIMIT,
                          ("vm_radix_lookup_ge: stack overflow"));
                      stack[tos++] = rnode;
                      rnode = child;
              }
      }
      
      /*
       * Look up the nearest entry at a position less than or equal to index.
       */
      vm_page_t
      vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
{
              struct vm_radix_node *stack[VM_RADIX_LIMIT];
              vm_pindex_t inc;
              vm_page_t m;
              struct vm_radix_node *child, *rnode;
      #ifdef INVARIANTS
              int loops = 0;
      #endif
              int slot, tos;
      
              rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                      return (NULL);
              else if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                      if (m->pindex <= index)
                              return (m);
                      else
                              return (NULL);
              }
              tos = 0;
              for (;;) {
                /*
                 * If the keys differ before the current bisection node,
                 * then the search key might roll back to the earliest
                 * available bisection node or to the largest key
                 * in the current node (if the owner is smaller than the
                 * search key).
                 */
                if (vm_radix_keybarr(rnode, index)) {
                        if (index > rnode->rn_owner) {
                                index = rnode->rn_owner + VM_RADIX_COUNT *
                                    VM_RADIX_UNITLEVEL(rnode->rn_clev);
                              } else {
      ascend:
                                      KASSERT(++loops < 1000,
                                          ("vm_radix_lookup_le: too many loops"));
      
                                      /*
                                       * Pop nodes from the stack until either the
                                       * stack is empty or a node that could have a
                                       * matching descendant is found.
                                       */
                                      do {
                                        if (tos == 0)
                                                return (NULL);
                                        rnode = stack[--tos];
                                } while (vm_radix_slot(index,
                                          rnode->rn_clev) == 0);
      
                                      /*
                                       * The following computation cannot overflow
                                       * because index's slot at the current level
                                       * is greater than 0.
                                       */
                                      index = vm_radix_trimkey(index,
                                          rnode->rn_clev);
                              }
                              index--;
                        KASSERT(!vm_radix_keybarr(rnode, index),
                                  ("vm_radix_lookup_le: keybarr failed"));
                      }
                      slot = vm_radix_slot(index, rnode->rn_clev);
                      child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                      if (vm_radix_isleaf(child)) {
                              m = vm_radix_topage(child);
                        if (m->pindex <= index)
                                return (m);
                } else if (child != NULL)
                              goto descend;
      
                      /*
                       * Look for an available edge or page within the current
                       * bisection node.
                       */
                if (slot > 0) {
                              inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
                              index |= inc - 1;
                              do {
                                      index -= inc;
                                      slot--;
                                      child = vm_radix_node_load(&rnode->rn_child[slot],
                                          LOCKED);
                                      if (vm_radix_isleaf(child)) {
                                              m = vm_radix_topage(child);
                                              if (m->pindex <= index)
                                                      return (m);
                                } else if (child != NULL)
                                        goto descend;
                        } while (slot > 0);
                }
                KASSERT(child == NULL || vm_radix_isleaf(child),
                          ("vm_radix_lookup_le: child is radix node"));
      
                      /*
                       * If a page or edge smaller than the search slot is not found
                       * in the current node, ascend to the next higher-level node.
                       */
                      goto ascend;
      descend:
                      KASSERT(rnode->rn_clev > 0,
                          ("vm_radix_lookup_le: pushing leaf's parent"));
                KASSERT(tos < VM_RADIX_LIMIT,
                          ("vm_radix_lookup_le: stack overflow"));
                      stack[tos++] = rnode;
                      rnode = child;
              }
      }
      
      /*
       * Remove the specified index from the trie, and return the value stored at
       * that index.  If the index is not present, return NULL.
       */
      vm_page_t
      vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
{
              struct vm_radix_node *rnode, *parent, *tmp;
              vm_page_t m;
              int i, slot;
      
              rnode = vm_radix_root_load(rtree, LOCKED);
              if (vm_radix_isleaf(rnode)) {
                      m = vm_radix_topage(rnode);
                if (m->pindex != index)
                        return (NULL);
                vm_radix_root_store(rtree, NULL, LOCKED);
                      return (m);
              }
              parent = NULL;
              for (;;) {
                if (rnode == NULL)
                              return (NULL);
                      slot = vm_radix_slot(index, rnode->rn_clev);
                      tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(tmp)) {
                              m = vm_radix_topage(tmp);
                              if (m->pindex != index)
                                      return (NULL);
                              vm_radix_node_store(&rnode->rn_child[slot], NULL, LOCKED);
                              rnode->rn_count--;
                        if (rnode->rn_count > 1)
                                return (m);
                        for (i = 0; i < VM_RADIX_COUNT; i++)
                                if (vm_radix_node_load(&rnode->rn_child[i],
                                          LOCKED) != NULL)
                                              break;
                              KASSERT(i != VM_RADIX_COUNT,
                                  ("%s: invalid node configuration", __func__));
                              tmp = vm_radix_node_load(&rnode->rn_child[i], LOCKED);
                              if (parent == NULL)
                                vm_radix_root_store(rtree, tmp, LOCKED);
                              else {
                                      slot = vm_radix_slot(index, parent->rn_clev);
                                      KASSERT(vm_radix_node_load(
                                          &parent->rn_child[slot], LOCKED) == rnode,
                                          ("%s: invalid child value", __func__));
                                vm_radix_node_store(&parent->rn_child[slot],
                                          tmp, LOCKED);
                              }
                        /*
                         * The child is still valid and we cannot zero the
                         * pointer until all SMR references are gone.
                         */
                              rnode->rn_count--;
                              vm_radix_node_put(rnode, i);
                              return (m);
                      }
                      parent = rnode;
                      rnode = tmp;
              }
      }
      
      /*
       * Remove and free all the nodes from the radix tree.
 * This function is recursive, but there is tight control on it since the
 * maximum depth of the tree is fixed.
       */
      void
      vm_radix_reclaim_allnodes(struct vm_radix *rtree)
{
              struct vm_radix_node *root;
      
              root = vm_radix_root_load(rtree, LOCKED);
              if (root == NULL)
                      return;
              vm_radix_root_store(rtree, NULL, UNSERIALIZED);
        if (!vm_radix_isleaf(root))
                vm_radix_reclaim_allnodes_int(root);
      }
      
      /*
       * Replace an existing page in the trie with another one.
       * Panics if there is not an old page in the trie at the new page's index.
       */
      vm_page_t
      vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
      {
              struct vm_radix_node *rnode, *tmp;
              vm_page_t m;
              vm_pindex_t index;
              int slot;
      
              index = newpage->pindex;
              rnode = vm_radix_root_load(rtree, LOCKED);
              if (rnode == NULL)
                      panic("%s: replacing page on an empty trie", __func__);
              if (vm_radix_isleaf(rnode)) {
                      m = vm_radix_topage(rnode);
                      if (m->pindex != index)
                              panic("%s: original replacing root key not found",
                                  __func__);
                      rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
                      return (m);
              }
              for (;;) {
                      slot = vm_radix_slot(index, rnode->rn_clev);
                      tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                      if (vm_radix_isleaf(tmp)) {
                              m = vm_radix_topage(tmp);
                              if (m->pindex == index) {
                                      vm_radix_node_store(&rnode->rn_child[slot],
                                          (struct vm_radix_node *)((uintptr_t)newpage |
                                          VM_RADIX_ISLEAF), LOCKED);
                                      return (m);
                              } else
                                      break;
                      } else if (tmp == NULL || vm_radix_keybarr(tmp, index))
                              break;
                      rnode = tmp;
              }
              panic("%s: original replacing page not found", __func__);
      }
      
      void
      vm_radix_wait(void)
      {
              uma_zwait(vm_radix_node_zone);
      }
      
      #ifdef DDB
      /*
       * Show details about the given radix node.
       */
      DB_SHOW_COMMAND(radixnode, db_show_radixnode)
      {
              struct vm_radix_node *rnode, *tmp;
              int i;
      
              if (!have_addr)
                      return;
              rnode = (struct vm_radix_node *)addr;
              db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
                  (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
                  rnode->rn_clev);
              for (i = 0; i < VM_RADIX_COUNT; i++) {
                      tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED);
                      if (tmp != NULL)
                              db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
                                  i, (void *)tmp,
                                  vm_radix_isleaf(tmp) ?  vm_radix_topage(tmp) : NULL,
                                  rnode->rn_clev);
              }
      }
      #endif /* DDB */
      /*-
       * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
       *
       * Copyright (c) 2004 Poul-Henning Kamp
       * Copyright (c) 1994,1997 John S. Dyson
       * Copyright (c) 2013 The FreeBSD Foundation
       * All rights reserved.
       *
       * Portions of this software were developed by Konstantin Belousov
       * under sponsorship from the FreeBSD Foundation.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      /*
 * This file contains a buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * was provided by David Greenman, also of the FreeBSD core team.
 *
 * See buf(9) for more information.
       */
      
      #include <sys/cdefs.h>
      __FBSDID("$FreeBSD$");
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/bio.h>
      #include <sys/bitset.h>
      #include <sys/conf.h>
      #include <sys/counter.h>
      #include <sys/buf.h>
      #include <sys/devicestat.h>
      #include <sys/eventhandler.h>
      #include <sys/fail.h>
      #include <sys/ktr.h>
      #include <sys/limits.h>
      #include <sys/lock.h>
      #include <sys/malloc.h>
      #include <sys/mount.h>
      #include <sys/mutex.h>
      #include <sys/kernel.h>
      #include <sys/kthread.h>
      #include <sys/proc.h>
      #include <sys/racct.h>
      #include <sys/refcount.h>
      #include <sys/resourcevar.h>
      #include <sys/rwlock.h>
      #include <sys/smp.h>
      #include <sys/sysctl.h>
      #include <sys/syscallsubr.h>
      #include <sys/vmem.h>
      #include <sys/vmmeter.h>
      #include <sys/vnode.h>
      #include <sys/watchdog.h>
      #include <geom/geom.h>
      #include <vm/vm.h>
      #include <vm/vm_param.h>
      #include <vm/vm_kern.h>
      #include <vm/vm_object.h>
      #include <vm/vm_page.h>
      #include <vm/vm_pageout.h>
      #include <vm/vm_pager.h>
      #include <vm/vm_extern.h>
      #include <vm/vm_map.h>
      #include <vm/swap_pager.h>
      
      static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
      
      struct        bio_ops bioops;                /* I/O operation notification */
      
      struct        buf_ops buf_ops_bio = {
              .bop_name        =        "buf_ops_bio",
              .bop_write        =        bufwrite,
              .bop_strategy        =        bufstrategy,
              .bop_sync        =        bufsync,
              .bop_bdflush        =        bufbdflush,
      };
      
      struct bufqueue {
              struct mtx_padalign        bq_lock;
              TAILQ_HEAD(, buf)        bq_queue;
              uint8_t                        bq_index;
              uint16_t                bq_subqueue;
              int                        bq_len;
      } __aligned(CACHE_LINE_SIZE);
      
      #define        BQ_LOCKPTR(bq)                (&(bq)->bq_lock)
      #define        BQ_LOCK(bq)                mtx_lock(BQ_LOCKPTR((bq)))
      #define        BQ_UNLOCK(bq)                mtx_unlock(BQ_LOCKPTR((bq)))
      #define        BQ_ASSERT_LOCKED(bq)        mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
      
      struct bufdomain {
              struct bufqueue        bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
              struct bufqueue bd_dirtyq;
              struct bufqueue        *bd_cleanq;
              struct mtx_padalign bd_run_lock;
              /* Constants */
              long                bd_maxbufspace;
              long                bd_hibufspace;
              long                 bd_lobufspace;
              long                 bd_bufspacethresh;
              int                bd_hifreebuffers;
              int                bd_lofreebuffers;
              int                bd_hidirtybuffers;
              int                bd_lodirtybuffers;
              int                bd_dirtybufthresh;
              int                bd_lim;
              /* atomics */
              int                bd_wanted;
              int __aligned(CACHE_LINE_SIZE)        bd_numdirtybuffers;
              int __aligned(CACHE_LINE_SIZE)        bd_running;
              long __aligned(CACHE_LINE_SIZE) bd_bufspace;
              int __aligned(CACHE_LINE_SIZE)        bd_freebuffers;
      } __aligned(CACHE_LINE_SIZE);
      
      #define        BD_LOCKPTR(bd)                (&(bd)->bd_cleanq->bq_lock)
      #define        BD_LOCK(bd)                mtx_lock(BD_LOCKPTR((bd)))
      #define        BD_UNLOCK(bd)                mtx_unlock(BD_LOCKPTR((bd)))
      #define        BD_ASSERT_LOCKED(bd)        mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
      #define        BD_RUN_LOCKPTR(bd)        (&(bd)->bd_run_lock)
      #define        BD_RUN_LOCK(bd)                mtx_lock(BD_RUN_LOCKPTR((bd)))
      #define        BD_RUN_UNLOCK(bd)        mtx_unlock(BD_RUN_LOCKPTR((bd)))
      #define        BD_DOMAIN(bd)                (bd - bdomain)
      
      static struct buf *buf;                /* buffer header pool */
      extern struct buf *swbuf;        /* Swap buffer header pool. */
      caddr_t __read_mostly unmapped_buf;
      
      /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
      struct proc *bufdaemonproc;
      
      static int inmem(struct vnode *vp, daddr_t blkno);
      static void vm_hold_free_pages(struct buf *bp, int newbsize);
      static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
                      vm_offset_t to);
      static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
      static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
                      vm_page_t m);
      static void vfs_clean_pages_dirty_buf(struct buf *bp);
      static void vfs_setdirty_range(struct buf *bp);
      static void vfs_vmio_invalidate(struct buf *bp);
      static void vfs_vmio_truncate(struct buf *bp, int npages);
      static void vfs_vmio_extend(struct buf *bp, int npages, int size);
      static int vfs_bio_clcheck(struct vnode *vp, int size,
                      daddr_t lblkno, daddr_t blkno);
      static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
                      void (*)(struct buf *));
      static int buf_flush(struct vnode *vp, struct bufdomain *, int);
      static int flushbufqueues(struct vnode *, struct bufdomain *, int, int);
      static void buf_daemon(void);
      static __inline void bd_wakeup(void);
      static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
      static void bufkva_reclaim(vmem_t *, int);
      static void bufkva_free(struct buf *);
      static int buf_import(void *, void **, int, int, int);
      static void buf_release(void *, void **, int);
      static void maxbcachebuf_adjust(void);
      static inline struct bufdomain *bufdomain(struct buf *);
      static void bq_remove(struct bufqueue *bq, struct buf *bp);
      static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
      static int buf_recycle(struct bufdomain *, bool kva);
      static void bq_init(struct bufqueue *bq, int qindex, int cpu,
                  const char *lockname);
      static void bd_init(struct bufdomain *bd);
      static int bd_flushall(struct bufdomain *bd);
      static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS);
      static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS);
      
      static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
      int vmiodirenable = TRUE;
      SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
          "Use the VM system for directory writes");
      long runningbufspace;
      SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
          "Amount of presently outstanding async buffer io");
      SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
          NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers");
      static counter_u64_t bufkvaspace;
      SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
          "Kernel virtual memory used for buffers");
      static long maxbufspace;
      SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace,
          CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace,
          __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L",
          "Maximum allowed value of bufspace (including metadata)");
      static long bufmallocspace;
      SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
          "Amount of malloced memory for buffers");
      static long maxbufmallocspace;
      SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
          0, "Maximum amount of malloced memory for buffers");
      static long lobufspace;
      SYSCTL_PROC(_vfs, OID_AUTO, lobufspace,
          CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace,
          __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L",
          "Minimum amount of buffers we want to have");
      long hibufspace;
      SYSCTL_PROC(_vfs, OID_AUTO, hibufspace,
          CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace,
          __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L",
          "Maximum allowed value of bufspace (excluding metadata)");
      long bufspacethresh;
      SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh,
          CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh,
          __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L",
          "Bufspace consumed before waking the daemon to free some");
      static counter_u64_t buffreekvacnt;
      SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
          "Number of times we have freed the KVA space from some buffer");
      static counter_u64_t bufdefragcnt;
      SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt,
          "Number of times we have had to repeat buffer allocation to defragment");
      static long lorunningspace;
      SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
          CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
          "Minimum preferred space used for in-progress I/O");
      static long hirunningspace;
      SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
          CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
          "Maximum amount of space to use for in-progress I/O");
      int dirtybufferflushes;
      SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
          0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
      int bdwriteskip;
      SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
          0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
      int altbufferflushes;
      SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW | CTLFLAG_STATS,
          &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers");
      static int recursiveflushes;
      SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW | CTLFLAG_STATS,
          &recursiveflushes, 0, "Number of flushes skipped due to being recursive");
      static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS);
      SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers,
          CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I",
          "Number of buffers that are dirty (has unwritten changes) at the moment");
      static int lodirtybuffers;
      SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers,
          CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers,
          __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I",
          "How many buffers we want to have free before bufdaemon can sleep");
      static int hidirtybuffers;
      SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers,
          CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers,
          __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I",
          "When the number of dirty buffers is considered severe");
      int dirtybufthresh;
      SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh,
          CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh,
          __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I",
          "Number of bdwrite to bawrite conversions to clear dirty buffers");
      static int numfreebuffers;
      SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
          "Number of free buffers");
      static int lofreebuffers;
      SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers,
          CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers,
          __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I",
         "Target number of free buffers");
      static int hifreebuffers;
      SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers,
          CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers,
          __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I",
         "Threshold for clean buffer recycling");
      static counter_u64_t getnewbufcalls;
      SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
         &getnewbufcalls, "Number of calls to getnewbuf");
      static counter_u64_t getnewbufrestarts;
      SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD,
          &getnewbufrestarts,
          "Number of times getnewbuf has had to restart a buffer acquisition");
      static counter_u64_t mappingrestarts;
      SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD,
          &mappingrestarts,
          "Number of times getblk has had to restart a buffer mapping for "
          "unmapped buffer");
      static counter_u64_t numbufallocfails;
      SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW,
          &numbufallocfails, "Number of times buffer allocations failed");
      static int flushbufqtarget = 100;
      SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
          "Amount of work to do in flushbufqueues when helping bufdaemon");
      static counter_u64_t notbufdflushes;
      SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes,
          "Number of dirty buffer flushes done by the bufdaemon helpers");
      static long barrierwrites;
      SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW | CTLFLAG_STATS,
          &barrierwrites, 0, "Number of barrier writes");
      SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
          &unmapped_buf_allowed, 0,
          "Permit the use of the unmapped i/o");
      int maxbcachebuf = MAXBCACHEBUF;
      SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0,
          "Maximum size of a buffer cache block");
      
      /*
       * This lock synchronizes access to bd_request.
       */
      static struct mtx_padalign __exclusive_cache_line bdlock;
      
      /*
       * This lock protects the runningbufreq and synchronizes runningbufwakeup and
       * waitrunningbufspace().
       */
      static struct mtx_padalign __exclusive_cache_line rbreqlock;
      
      /*
       * Lock that protects bdirtywait.
       */
      static struct mtx_padalign __exclusive_cache_line bdirtylock;
      
      /*
       * Wakeup point for bufdaemon, as well as indicator of whether it is already
       * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
       * is idling.
       */
      static int bd_request;
      
      /*
       * Request for the buf daemon to write more buffers than is indicated by
 * lodirtybuffers.  This may be necessary to push out excess dependencies or
       * defragment the address space where a simple count of the number of dirty
       * buffers is insufficient to characterize the demand for flushing them.
       */
      static int bd_speedupreq;
      
      /*
       * Synchronization (sleep/wakeup) variable for active buffer space requests.
       * Set when wait starts, cleared prior to wakeup().
       * Used in runningbufwakeup() and waitrunningbufspace().
       */
      static int runningbufreq;
      
      /*
       * Synchronization for bwillwrite() waiters.
       */
      static int bdirtywait;
      
      /*
       * Definitions for the buffer free lists.
       */
      #define QUEUE_NONE        0        /* on no queue */
      #define QUEUE_EMPTY        1        /* empty buffer headers */
      #define QUEUE_DIRTY        2        /* B_DELWRI buffers */
      #define QUEUE_CLEAN        3        /* non-B_DELWRI buffers */
#define QUEUE_SENTINEL        4        /* not a queue index, but a mark for the sentinel */
      
      /* Maximum number of buffer domains. */
      #define        BUF_DOMAINS        8
      
      struct bufdomainset bdlodirty;                /* Domains > lodirty */
      struct bufdomainset bdhidirty;                /* Domains > hidirty */
      
      /* Configured number of clean queues. */
      static int __read_mostly buf_domains;
      
      BITSET_DEFINE(bufdomainset, BUF_DOMAINS);
      struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS];
      struct bufqueue __exclusive_cache_line bqempty;
      
      /*
       * per-cpu empty buffer cache.
       */
      uma_zone_t buf_zone;
      
      /*
       * Single global constant for BUF_WMESG, to avoid getting multiple references.
 * buf_wmesg is referenced from macros.
       */
      const char *buf_wmesg = BUF_WMESG;
      
      static int
      sysctl_runningspace(SYSCTL_HANDLER_ARGS)
      {
              long value;
              int error;
      
              value = *(long *)arg1;
              error = sysctl_handle_long(oidp, &value, 0, req);
              if (error != 0 || req->newptr == NULL)
                      return (error);
              mtx_lock(&rbreqlock);
              if (arg1 == &hirunningspace) {
                      if (value < lorunningspace)
                              error = EINVAL;
                      else
                              hirunningspace = value;
              } else {
                      KASSERT(arg1 == &lorunningspace,
                          ("%s: unknown arg1", __func__));
                      if (value > hirunningspace)
                              error = EINVAL;
                      else
                              lorunningspace = value;
              }
              mtx_unlock(&rbreqlock);
              return (error);
      }
      
      static int
      sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS)
      {
              int error;
              int value;
              int i;
      
              value = *(int *)arg1;
              error = sysctl_handle_int(oidp, &value, 0, req);
              if (error != 0 || req->newptr == NULL)
                      return (error);
              *(int *)arg1 = value;
              for (i = 0; i < buf_domains; i++)
                      *(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
                          value / buf_domains;
      
              return (error);
      }
      
      static int
      sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS)
      {
              long value;
              int error;
              int i;
      
              value = *(long *)arg1;
              error = sysctl_handle_long(oidp, &value, 0, req);
              if (error != 0 || req->newptr == NULL)
                      return (error);
              *(long *)arg1 = value;
              for (i = 0; i < buf_domains; i++)
                      *(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
                          value / buf_domains;
      
              return (error);
      }
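
/*
 * Worked example (illustrative only): with buf_domains == 4, writing 400
 * through one of the handlers above stores 400 in the global variable and
 * 400 / 4 == 100 in the corresponding field of each buffer domain.
 */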
      
      #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
          defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
      static int
      sysctl_bufspace(SYSCTL_HANDLER_ARGS)
      {
              long lvalue;
              int ivalue;
              int i;
      
              lvalue = 0;
              for (i = 0; i < buf_domains; i++)
                      lvalue += bdomain[i].bd_bufspace;
              if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
                      return (sysctl_handle_long(oidp, &lvalue, 0, req));
              if (lvalue > INT_MAX)
                      /* On overflow, still write out a long to trigger ENOMEM. */
                      return (sysctl_handle_long(oidp, &lvalue, 0, req));
              ivalue = lvalue;
              return (sysctl_handle_int(oidp, &ivalue, 0, req));
      }
      #else
      static int
      sysctl_bufspace(SYSCTL_HANDLER_ARGS)
      {
              long lvalue;
              int i;
      
              lvalue = 0;
              for (i = 0; i < buf_domains; i++)
                      lvalue += bdomain[i].bd_bufspace;
              return (sysctl_handle_long(oidp, &lvalue, 0, req));
      }
      #endif
      
      static int
      sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS)
      {
              int value;
              int i;
      
              value = 0;
              for (i = 0; i < buf_domains; i++)
                      value += bdomain[i].bd_numdirtybuffers;
              return (sysctl_handle_int(oidp, &value, 0, req));
      }
      
      /*
       *        bdirtywakeup:
       *
       *        Wakeup any bwillwrite() waiters.
       */
      static void
      bdirtywakeup(void)
      {
              mtx_lock(&bdirtylock);
              if (bdirtywait) {
                      bdirtywait = 0;
                      wakeup(&bdirtywait);
              }
              mtx_unlock(&bdirtylock);
      }
      
      /*
       *        bd_clear:
       *
       *        Clear a domain from the appropriate bitsets when dirtybuffers
       *        is decremented.
       */
      static void
      bd_clear(struct bufdomain *bd)
      {
      
              mtx_lock(&bdirtylock);
              if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers)
                      BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
              if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers)
                      BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
              mtx_unlock(&bdirtylock);
      }
      
      /*
       *        bd_set:
       *
       *        Set a domain in the appropriate bitsets when dirtybuffers
       *        is incremented.
       */
      static void
      bd_set(struct bufdomain *bd)
      {
      
              mtx_lock(&bdirtylock);
              if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers)
                      BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
              if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers)
                      BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
              mtx_unlock(&bdirtylock);
      }
      
      /*
       *        bdirtysub:
       *
       *        Decrement the numdirtybuffers count by one and wakeup any
       *        threads blocked in bwillwrite().
       */
      static void
      bdirtysub(struct buf *bp)
{
              struct bufdomain *bd;
              int num;
      
              bd = bufdomain(bp);
              num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1);
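        /*
         * num is the count before the decrement: wake bwillwrite()
         * waiters only as we cross the midpoint between the low and
         * high dirty watermarks, and re-evaluate the dirty bitsets
         * only as we cross the watermarks themselves.
         */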
        if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
                bdirtywakeup();
        if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
                      bd_clear(bd);
      }
      
      /*
       *        bdirtyadd:
       *
       *        Increment the numdirtybuffers count by one and wakeup the buf 
       *        daemon if needed.
       */
      static void
      bdirtyadd(struct buf *bp)
      {
              struct bufdomain *bd;
              int num;
      
              /*
               * Only do the wakeup once as we cross the boundary.  The
               * buf daemon will keep running until the condition clears.
               */
              bd = bufdomain(bp);
              num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1);
        if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
                bd_wakeup();
        if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
                      bd_set(bd);
      }
      
      /*
       *        bufspace_daemon_wakeup:
       *
       *        Wakeup the daemons responsible for freeing clean bufs.
       */
      static void
      bufspace_daemon_wakeup(struct bufdomain *bd)
      {
      
        /*
         * Avoid the lock if the daemon is already running: the fetchadd
         * detects the 0 -> non-zero transition of bd_running without
         * taking BD_RUN_LOCK().  bd_running is only used as a boolean,
         * so it is stored back as 1 below.
         */
              if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) {
                      BD_RUN_LOCK(bd);
                      atomic_store_int(&bd->bd_running, 1);
                      wakeup(&bd->bd_running);
                      BD_RUN_UNLOCK(bd);
              }
      }
      
      /*
       *        bufspace_daemon_wait:
       *
       *        Sleep until the domain falls below a limit or one second passes.
       */
      static void
      bufspace_daemon_wait(struct bufdomain *bd)
      {
              /*
               * Re-check our limits and sleep.  bd_running must be
               * cleared prior to checking the limits to avoid missed
               * wakeups.  The waker will adjust one of bufspace or
               * freebuffers prior to checking bd_running.
               */
              BD_RUN_LOCK(bd);
              atomic_store_int(&bd->bd_running, 0);
              if (bd->bd_bufspace < bd->bd_bufspacethresh &&
                  bd->bd_freebuffers > bd->bd_lofreebuffers) {
                      msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP,
                          "-", hz);
              } else {
                      /* Avoid spurious wakeups while running. */
                      atomic_store_int(&bd->bd_running, 1);
                      BD_RUN_UNLOCK(bd);
              }
      }
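
/*
 * Together, bufspace_daemon_wakeup() and bufspace_daemon_wait() form a
 * wakeup handshake: the daemon clears bd_running under BD_RUN_LOCK()
 * before re-checking its limits, while wakers adjust bufspace or
 * freebuffers first and only then test bd_running, so an update made
 * after the daemon's check still results in a wakeup.
 */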
      
      /*
       *        bufspace_adjust:
       *
       *        Adjust the reported bufspace for a KVA managed buffer, possibly
       *         waking any waiters.
       */
      static void
      bufspace_adjust(struct buf *bp, int bufsize)
{
              struct bufdomain *bd;
              long space;
              int diff;
      
              KASSERT((bp->b_flags & B_MALLOC) == 0,
                  ("bufspace_adjust: malloc buf %p", bp));
              bd = bufdomain(bp);
              diff = bufsize - bp->b_bufsize;
              if (diff < 0) {
                      atomic_subtract_long(&bd->bd_bufspace, -diff);
              } else if (diff > 0) {
                      space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
                      /* Wake up the daemon on the transition. */
                if (space < bd->bd_bufspacethresh &&
                          space + diff >= bd->bd_bufspacethresh)
                              bufspace_daemon_wakeup(bd);
              }
              bp->b_bufsize = bufsize;
      }
      
      /*
       *        bufspace_reserve:
       *
       *        Reserve bufspace before calling allocbuf().  metadata has a
       *        different space limit than data.
       */
      static int
      bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
      {
              long limit, new;
              long space;
      
              if (metadata)
                      limit = bd->bd_maxbufspace;
              else
                      limit = bd->bd_hibufspace;
              space = atomic_fetchadd_long(&bd->bd_bufspace, size);
              new = space + size;
              if (new > limit) {
                      atomic_subtract_long(&bd->bd_bufspace, size);
                      return (ENOSPC);
              }
      
              /* Wake up the daemon on the transition. */
        if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
                      bufspace_daemon_wakeup(bd);
      
              return (0);
      }
      
      /*
       *        bufspace_release:
       *
       *        Release reserved bufspace after bufspace_adjust() has consumed it.
       */
      static void
      bufspace_release(struct bufdomain *bd, int size)
      {
      
              atomic_subtract_long(&bd->bd_bufspace, size);
      }
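
/*
 * A rough sketch of the expected call sequence, based on the comments
 * above: a caller reserves space with bufspace_reserve() before
 * allocbuf(), bufspace_adjust() then accounts the space the buffer
 * actually consumes, and bufspace_release() finally drops the original
 * reservation.
 */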
      
      /*
       *        bufspace_wait:
       *
       *        Wait for bufspace, acting as the buf daemon if a locked vnode is
       *        supplied.  bd_wanted must be set prior to polling for space.  The
       *        operation must be re-tried on return.
       */
      static void
      bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags,
          int slpflag, int slptimeo)
      {
              struct thread *td;
              int error, fl, norunbuf;
      
              if ((gbflags & GB_NOWAIT_BD) != 0)
                      return;
      
              td = curthread;
              BD_LOCK(bd);
              while (bd->bd_wanted) {
                      if (vp != NULL && vp->v_type != VCHR &&
                          (td->td_pflags & TDP_BUFNEED) == 0) {
                              BD_UNLOCK(bd);
                              /*
                               * getblk() is called with a vnode locked, and
                               * some majority of the dirty buffers may as
                               * well belong to the vnode.  Flushing the
                               * buffers there would make a progress that
                               * cannot be achieved by the buf_daemon, that
                               * cannot lock the vnode.
                               */
                              norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
                                  (td->td_pflags & TDP_NORUNNINGBUF);
      
                              /*
                               * Play bufdaemon.  The getnewbuf() function
                               * may be called while the thread owns lock
                               * for another dirty buffer for the same
                               * vnode, which makes it impossible to use
                               * VOP_FSYNC() there, due to the buffer lock
                               * recursion.
                               */
                              td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
                              fl = buf_flush(vp, bd, flushbufqtarget);
                              td->td_pflags &= norunbuf;
                              BD_LOCK(bd);
                              if (fl != 0)
                                      continue;
                              if (bd->bd_wanted == 0)
                                      break;
                      }
                      error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
                          (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
                      if (error != 0)
                              break;
              }
              BD_UNLOCK(bd);
      }
      
      /*
       *        bufspace_daemon:
       *
       *        buffer space management daemon.  Tries to maintain some marginal
       *        amount of free buffer space so that requesting processes neither
       *        block nor work to reclaim buffers.
       */
      static void
      bufspace_daemon(void *arg)
      {
              struct bufdomain *bd;
      
              EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread,
                  SHUTDOWN_PRI_LAST + 100);
      
              bd = arg;
              for (;;) {
                      kthread_suspend_check();
      
                      /*
                       * Free buffers from the clean queue until we meet our
                       * targets.
                       *
                       * Theory of operation:  The buffer cache is most efficient
                       * when some free buffer headers and space are always
                       * available to getnewbuf().  This daemon attempts to prevent
                       * the excessive blocking and synchronization associated
                       * with shortfall.  It goes through three phases according
                 * to demand:
                       *
                       * 1)        The daemon wakes up voluntarily once per-second
                       *        during idle periods when the counters are below
                       *        the wakeup thresholds (bufspacethresh, lofreebuffers).
                       *
                       * 2)        The daemon wakes up as we cross the thresholds
                       *        ahead of any potential blocking.  This may bounce
                       *        slightly according to the rate of consumption and
                       *        release.
                       *
                       * 3)        The daemon and consumers are starved for working
                       *        clean buffers.  This is the 'bufspace' sleep below
                       *        which will inefficiently trade bufs with bqrelse
                       *        until we return to condition 2.
                       */
                      while (bd->bd_bufspace > bd->bd_lobufspace ||
                          bd->bd_freebuffers < bd->bd_hifreebuffers) {
                              if (buf_recycle(bd, false) != 0) {
                                      if (bd_flushall(bd))
                                              continue;
                                      /*
                                       * Speedup dirty if we've run out of clean
                                       * buffers.  This is possible in particular
                                 * because softdep may hold many bufs locked
                                       * pending writes to other bufs which are
                                       * marked for delayed write, exhausting
                                       * clean space until they are written.
                                       */
                                      bd_speedup();
                                      BD_LOCK(bd);
                                      if (bd->bd_wanted) {
                                              msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
                                                  PRIBIO|PDROP, "bufspace", hz/10);
                                      } else
                                              BD_UNLOCK(bd);
                              }
                              maybe_yield();
                      }
                      bufspace_daemon_wait(bd);
              }
      }
      
      /*
       *        bufmallocadjust:
       *
       *        Adjust the reported bufspace for a malloc managed buffer, possibly
       *        waking any waiters.
       */
      static void
      bufmallocadjust(struct buf *bp, int bufsize)
      {
              int diff;
      
              KASSERT((bp->b_flags & B_MALLOC) != 0,
                  ("bufmallocadjust: non-malloc buf %p", bp));
              diff = bufsize - bp->b_bufsize;
              if (diff < 0)
                      atomic_subtract_long(&bufmallocspace, -diff);
              else
                      atomic_add_long(&bufmallocspace, diff);
              bp->b_bufsize = bufsize;
      }
      
      /*
       *        runningwakeup:
       *
       *        Wake up processes that are waiting on asynchronous writes to fall
       *        below lorunningspace.
       */
      static void
      runningwakeup(void)
      {
      
              mtx_lock(&rbreqlock);
              if (runningbufreq) {
                      runningbufreq = 0;
                      wakeup(&runningbufreq);
              }
              mtx_unlock(&rbreqlock);
      }
      
      /*
       *        runningbufwakeup:
       *
 *        Decrement the outstanding write count and wake up any waiters
 *        on the transition below lorunningspace.
       */
      void
      runningbufwakeup(struct buf *bp)
{
              long space, bspace;
      
              bspace = bp->b_runningbufspace;
        if (bspace == 0)
                      return;
              space = atomic_fetchadd_long(&runningbufspace, -bspace);
              KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
                  space, bspace));
              bp->b_runningbufspace = 0;
              /*
               * Only acquire the lock and wakeup on the transition from exceeding
               * the threshold to falling below it.
               */
              if (space < lorunningspace)
                      return;
              if (space - bspace > lorunningspace)
                      return;
              runningwakeup();
      }
      
      /*
       *        waitrunningbufspace()
       *
       *        runningbufspace is a measure of the amount of I/O currently
       *        running.  This routine is used in async-write situations to
       *        prevent creating huge backups of pending writes to a device.
       *        Only asynchronous writes are governed by this function.
       *
       *        This does NOT turn an async write into a sync write.  It waits  
       *        for earlier writes to complete and generally returns before the
       *        caller's write has reached the device.
       */
      void
      waitrunningbufspace(void)
      {
      
              mtx_lock(&rbreqlock);
              while (runningbufspace > hirunningspace) {
                      runningbufreq = 1;
                      msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
              }
              mtx_unlock(&rbreqlock);
      }
      
      /*
       *        vfs_buf_test_cache:
       *
       *        Called when a buffer is extended.  This function clears the B_CACHE
       *        bit if the newly extended portion of the buffer does not contain
       *        valid data.
       */
      static __inline void
      vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
          vm_offset_t size, vm_page_t m)
      {
      
              /*
               * This function and its results are protected by higher level
               * synchronization requiring vnode and buf locks to page in and
               * validate pages.
               */
              if (bp->b_flags & B_CACHE) {
                      int base = (foff + off) & PAGE_MASK;
                if (vm_page_is_valid(m, base, size) == 0)
                        bp->b_flags &= ~B_CACHE;
              }
      }
      
      /* Wake up the buffer daemon if necessary */
      static void
      bd_wakeup(void)
      {
      
              mtx_lock(&bdlock);
              if (bd_request == 0) {
                      bd_request = 1;
                      wakeup(&bd_request);
              }
              mtx_unlock(&bdlock);
      }
      
      /*
 * Adjust the maxbcachebuf tunable.
       */
      static void
      maxbcachebuf_adjust(void)
      {
              int i;
      
              /*
               * maxbcachebuf must be a power of 2 >= MAXBSIZE.
               */
              i = 2;
              while (i * 2 <= maxbcachebuf)
                      i *= 2;
              maxbcachebuf = i;
              if (maxbcachebuf < MAXBSIZE)
                      maxbcachebuf = MAXBSIZE;
              if (maxbcachebuf > MAXPHYS)
                      maxbcachebuf = MAXPHYS;
              if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF)
                      printf("maxbcachebuf=%d\n", maxbcachebuf);
      }
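
/*
 * For example, assuming the common defaults MAXBSIZE = 64KiB and
 * MAXPHYS = 128KiB: a maxbcachebuf setting of 100000 is rounded down to
 * the power of two 65536, a smaller setting is raised to MAXBSIZE, and
 * a setting of 262144 is clamped down to MAXPHYS.
 */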
      
      /*
       * bd_speedup - speedup the buffer cache flushing code
       */
      void
      bd_speedup(void)
      {
              int needwake;
      
              mtx_lock(&bdlock);
              needwake = 0;
              if (bd_speedupreq == 0 || bd_request == 0)
                      needwake = 1;
              bd_speedupreq = 1;
              bd_request = 1;
              if (needwake)
                      wakeup(&bd_request);
              mtx_unlock(&bdlock);
      }
      
      #ifdef __i386__
      #define        TRANSIENT_DENOM        5
      #else
      #define        TRANSIENT_DENOM 10
      #endif
      
      /*
       * Calculating buffer cache scaling values and reserve space for buffer
       * headers.  This is called during low level kernel initialization and
 * may be called more than once.  We CANNOT write to the memory area
       * being reserved at this time.
       */
      caddr_t
      kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
      {
              int tuned_nbuf;
              long maxbuf, maxbuf_sz, buf_sz,        biotmap_sz;
      
              /*
               * physmem_est is in pages.  Convert it to kilobytes (assumes
               * PAGE_SIZE is >= 1K)
               */
              physmem_est = physmem_est * (PAGE_SIZE / 1024);
      
              maxbcachebuf_adjust();
              /*
               * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
               * For the first 64MB of ram nominally allocate sufficient buffers to
               * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
               * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
               * the buffer cache we limit the eventual kva reservation to
               * maxbcache bytes.
               *
               * factor represents the 1/4 x ram conversion.
               */
              if (nbuf == 0) {
                      int factor = 4 * BKVASIZE / 1024;
      
                      nbuf = 50;
                      if (physmem_est > 4096)
                              nbuf += min((physmem_est - 4096) / factor,
                                  65536 / factor);
                      if (physmem_est > 65536)
                              nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
                                  32 * 1024 * 1024 / (factor * 5));
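
                /*
                 * A worked example (assuming, for illustration, BKVASIZE of
                 * 16KiB, so factor is 64): with physmem_est of 1GiB
                 * (1048576KiB) this gives nbuf = 50 + 1024 + 6144 = 7218
                 * buffers, i.e. roughly 113MiB of buffer KVA before the
                 * maxbcache and maxbuf clamps below.
                 */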
      
                      if (maxbcache && nbuf > maxbcache / BKVASIZE)
                              nbuf = maxbcache / BKVASIZE;
                      tuned_nbuf = 1;
              } else
                      tuned_nbuf = 0;
      
              /* XXX Avoid unsigned long overflows later on with maxbufspace. */
              maxbuf = (LONG_MAX / 3) / BKVASIZE;
              if (nbuf > maxbuf) {
                      if (!tuned_nbuf)
                              printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
                                  maxbuf);
                      nbuf = maxbuf;
              }
      
              /*
               * Ideal allocation size for the transient bio submap is 10%
               * of the maximal space buffer map.  This roughly corresponds
               * to the amount of the buffer mapped for typical UFS load.
               *
               * Clip the buffer map to reserve space for the transient
               * BIOs, if its extent is bigger than 90% (80% on i386) of the
               * maximum buffer map extent on the platform.
               *
         * The fall-back to maxbuf when maxbcache is unset avoids
         * trimming the buffer KVA on architectures with ample KVA
         * space.
               */
              if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
                      maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
                      buf_sz = (long)nbuf * BKVASIZE;
                      if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
                          (TRANSIENT_DENOM - 1)) {
                              /*
                               * There is more KVA than memory.  Do not
                               * adjust buffer map size, and assign the rest
                               * of maxbuf to transient map.
                               */
                              biotmap_sz = maxbuf_sz - buf_sz;
                      } else {
                              /*
                               * Buffer map spans all KVA we could afford on
                               * this platform.  Give 10% (20% on i386) of
                               * the buffer map to the transient bio map.
                               */
                              biotmap_sz = buf_sz / TRANSIENT_DENOM;
                              buf_sz -= biotmap_sz;
                      }
                      if (biotmap_sz / INT_MAX > MAXPHYS)
                              bio_transient_maxcnt = INT_MAX;
                      else
                              bio_transient_maxcnt = biotmap_sz / MAXPHYS;
                      /*
                       * Artificially limit to 1024 simultaneous in-flight I/Os
                       * using the transient mapping.
                       */
                      if (bio_transient_maxcnt > 1024)
                              bio_transient_maxcnt = 1024;
                      if (tuned_nbuf)
                              nbuf = buf_sz / BKVASIZE;
              }
      
              if (nswbuf == 0) {
                      nswbuf = min(nbuf / 4, 256);
                      if (nswbuf < NSWBUF_MIN)
                              nswbuf = NSWBUF_MIN;
              }
      
              /*
               * Reserve space for the buffer cache buffers
               */
              buf = (void *)v;
              v = (caddr_t)(buf + nbuf);
      
              return(v);
      }
      
      /* Initialize the buffer subsystem.  Called before use of any buffers. */
      void
      bufinit(void)
      {
              struct buf *bp;
              int i;
      
              KASSERT(maxbcachebuf >= MAXBSIZE,
                  ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
                  MAXBSIZE));
              bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
              mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
              mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
              mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
      
              unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
      
              /* finally, initialize each buffer header and stick on empty q */
              for (i = 0; i < nbuf; i++) {
                      bp = &buf[i];
                      bzero(bp, sizeof *bp);
                      bp->b_flags = B_INVAL;
                      bp->b_rcred = NOCRED;
                      bp->b_wcred = NOCRED;
                      bp->b_qindex = QUEUE_NONE;
                      bp->b_domain = -1;
                      bp->b_subqueue = mp_maxid + 1;
                      bp->b_xflags = 0;
                      bp->b_data = bp->b_kvabase = unmapped_buf;
                      LIST_INIT(&bp->b_dep);
                      BUF_LOCKINIT(bp);
                      bq_insert(&bqempty, bp, false);
              }
      
              /*
               * maxbufspace is the absolute maximum amount of buffer space we are 
               * allowed to reserve in KVM and in real terms.  The absolute maximum
               * is nominally used by metadata.  hibufspace is the nominal maximum
               * used by most other requests.  The differential is required to 
               * ensure that metadata deadlocks don't occur.
               *
               * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
               * this may result in KVM fragmentation which is not handled optimally
               * by the system. XXX This is less true with vmem.  We could use
               * PAGE_SIZE.
               */
              maxbufspace = (long)nbuf * BKVASIZE;
              hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10);
              lobufspace = (hibufspace / 20) * 19; /* 95% */
              bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
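        /*
         * Taken together these give lobufspace < bufspacethresh <
         * hibufspace <= maxbufspace: the bufspace daemon is woken as
         * bufspace crosses bufspacethresh and reclaims until bufspace
         * drops back below lobufspace (and enough buffers are free).
         */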
      
              /*
               * Note: The 16 MiB upper limit for hirunningspace was chosen
               * arbitrarily and may need further tuning. It corresponds to
               * 128 outstanding write IO requests (if IO size is 128 KiB),
               * which fits with many RAID controllers' tagged queuing limits.
               * The lower 1 MiB limit is the historical upper limit for
               * hirunningspace.
               */
              hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf),
                  16 * 1024 * 1024), 1024 * 1024);
              lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf);
      
              /*
               * Limit the amount of malloc memory since it is wired permanently into
               * the kernel space.  Even though this is accounted for in the buffer
               * allocation, we don't want the malloced region to grow uncontrolled.
               * The malloc scheme improves memory utilization significantly on
               * average (small) directories.
               */
              maxbufmallocspace = hibufspace / 20;
      
              /*
               * Reduce the chance of a deadlock occurring by limiting the number
               * of delayed-write dirty buffers we allow to stack up.
               */
              hidirtybuffers = nbuf / 4 + 20;
              dirtybufthresh = hidirtybuffers * 9 / 10;
              /*
               * To support extreme low-memory systems, make sure hidirtybuffers
               * cannot eat up all available buffer space.  This occurs when our
               * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
               * buffer space assuming BKVASIZE'd buffers.
               */
              while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
                      hidirtybuffers >>= 1;
              }
              lodirtybuffers = hidirtybuffers / 2;
      
              /*
               * lofreebuffers should be sufficient to avoid stalling waiting on
               * buf headers under heavy utilization.  The bufs in per-cpu caches
               * are counted as free but will be unavailable to threads executing
               * on other cpus.
               *
               * hifreebuffers is the free target for the bufspace daemon.  This
               * should be set appropriately to limit work per-iteration.
               */
              lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
              hifreebuffers = (3 * lofreebuffers) / 2;
              numfreebuffers = nbuf;
      
              /* Setup the kva and free list allocators. */
              vmem_set_reclaim(buffer_arena, bufkva_reclaim);
              buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
                  NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
      
              /*
               * Size the clean queue according to the amount of buffer space.
         * One queue per 256MB up to the max.  More queues give better
         * concurrency but less accurate LRU.
               */
              buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS);
              for (i = 0 ; i < buf_domains; i++) {
                      struct bufdomain *bd;
      
                      bd = &bdomain[i];
                      bd_init(bd);
                      bd->bd_freebuffers = nbuf / buf_domains;
                      bd->bd_hifreebuffers = hifreebuffers / buf_domains;
                      bd->bd_lofreebuffers = lofreebuffers / buf_domains;
                      bd->bd_bufspace = 0;
                      bd->bd_maxbufspace = maxbufspace / buf_domains;
                      bd->bd_hibufspace = hibufspace / buf_domains;
                      bd->bd_lobufspace = lobufspace / buf_domains;
                      bd->bd_bufspacethresh = bufspacethresh / buf_domains;
                      bd->bd_numdirtybuffers = 0;
                      bd->bd_hidirtybuffers = hidirtybuffers / buf_domains;
                      bd->bd_lodirtybuffers = lodirtybuffers / buf_domains;
                      bd->bd_dirtybufthresh = dirtybufthresh / buf_domains;
                      /* Don't allow more than 2% of bufs in the per-cpu caches. */
                      bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus;
              }
              getnewbufcalls = counter_u64_alloc(M_WAITOK);
              getnewbufrestarts = counter_u64_alloc(M_WAITOK);
              mappingrestarts = counter_u64_alloc(M_WAITOK);
              numbufallocfails = counter_u64_alloc(M_WAITOK);
              notbufdflushes = counter_u64_alloc(M_WAITOK);
              buffreekvacnt = counter_u64_alloc(M_WAITOK);
              bufdefragcnt = counter_u64_alloc(M_WAITOK);
              bufkvaspace = counter_u64_alloc(M_WAITOK);
      }
      
      #ifdef INVARIANTS
      static inline void
      vfs_buf_check_mapped(struct buf *bp)
      {
      
              KASSERT(bp->b_kvabase != unmapped_buf,
                  ("mapped buf: b_kvabase was not updated %p", bp));
              KASSERT(bp->b_data != unmapped_buf,
                  ("mapped buf: b_data was not updated %p", bp));
        KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf +
                  MAXPHYS, ("b_data + b_offset unmapped %p", bp));
      }
      
      static inline void
      vfs_buf_check_unmapped(struct buf *bp)
      {
      
              KASSERT(bp->b_data == unmapped_buf,
                  ("unmapped buf: corrupted b_data %p", bp));
      }
      
      #define        BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
      #define        BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
      #else
      #define        BUF_CHECK_MAPPED(bp) do {} while (0)
      #define        BUF_CHECK_UNMAPPED(bp) do {} while (0)
      #endif
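
/*
 * A buffer counts as busy for bufshutdown() if it is locked and still
 * valid, or if it is a delayed-write (dirty) buffer that has not been
 * invalidated.
 */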
      
      static int
      isbufbusy(struct buf *bp)
      {
              if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) ||
                  ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
                      return (1);
              return (0);
      }
      
      /*
       * Shutdown the system cleanly to prepare for reboot, halt, or power off.
       */
      void
      bufshutdown(int show_busybufs)
      {
              static int first_buf_printf = 1;
              struct buf *bp;
              int iter, nbusy, pbusy;
      #ifndef PREEMPTION
              int subiter;
      #endif
      
              /* 
               * Sync filesystems for shutdown
               */
              wdog_kern_pat(WD_LASTVAL);
              kern_sync(curthread);
      
              /*
               * With soft updates, some buffers that are
               * written will be remarked as dirty until other
               * buffers are written.
               */
              for (iter = pbusy = 0; iter < 20; iter++) {
                      nbusy = 0;
                      for (bp = &buf[nbuf]; --bp >= buf; )
                              if (isbufbusy(bp))
                                      nbusy++;
                      if (nbusy == 0) {
                              if (first_buf_printf)
                                      printf("All buffers synced.");
                              break;
                      }
                      if (first_buf_printf) {
                              printf("Syncing disks, buffers remaining... ");
                              first_buf_printf = 0;
                      }
                      printf("%d ", nbusy);
                      if (nbusy < pbusy)
                              iter = 0;
                      pbusy = nbusy;
      
                      wdog_kern_pat(WD_LASTVAL);
                      kern_sync(curthread);
      
      #ifdef PREEMPTION
                      /*
                       * Spin for a while to allow interrupt threads to run.
                       */
                      DELAY(50000 * iter);
      #else
                      /*
                       * Context switch several times to allow interrupt
                       * threads to run.
                       */
                      for (subiter = 0; subiter < 50 * iter; subiter++) {
                              thread_lock(curthread);
                              mi_switch(SW_VOL);
                              DELAY(1000);
                      }
      #endif
              }
              printf("\n");
              /*
         * Count only busy local buffers to prevent forcing
         * a fsck if we're just a client of a wedged NFS server.
               */
              nbusy = 0;
              for (bp = &buf[nbuf]; --bp >= buf; ) {
                      if (isbufbusy(bp)) {
      #if 0
      /* XXX: This is bogus.  We should probably have a BO_REMOTE flag instead */
                              if (bp->b_dev == NULL) {
                                      TAILQ_REMOVE(&mountlist,
                                          bp->b_vp->v_mount, mnt_list);
                                      continue;
                              }
      #endif
                              nbusy++;
                              if (show_busybufs > 0) {
                                      printf(
                  "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
                                          nbusy, bp, bp->b_vp, bp->b_flags,
                                          (intmax_t)bp->b_blkno,
                                          (intmax_t)bp->b_lblkno);
                                      BUF_LOCKPRINTINFO(bp);
                                      if (show_busybufs > 1)
                                              vn_printf(bp->b_vp,
                                                  "vnode content: ");
                              }
                      }
              }
              if (nbusy) {
                      /*
                       * Failed to sync all blocks. Indicate this and don't
                       * unmount filesystems (thus forcing an fsck on reboot).
                       */
                      printf("Giving up on %d buffers\n", nbusy);
                      DELAY(5000000);        /* 5 seconds */
              } else {
                      if (!first_buf_printf)
                              printf("Final sync complete\n");
                      /*
                       * Unmount filesystems
                       */
                      if (!KERNEL_PANICKED())
                              vfs_unmountall();
              }
              swapoff_all();
              DELAY(100000);                /* wait for console output to finish */
      }
      
      static void
      bpmap_qenter(struct buf *bp)
{
      
              BUF_CHECK_MAPPED(bp);
      
              /*
               * bp->b_data is relative to bp->b_offset, but
               * bp->b_offset may be offset into the first page.
               */
        bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
              pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
              bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
                  (vm_offset_t)(bp->b_offset & PAGE_MASK));
      }
      
      static inline struct bufdomain *
      bufdomain(struct buf *bp)
      {
      
              return (&bdomain[bp->b_domain]);
      }
      
      static struct bufqueue *
      bufqueue(struct buf *bp)
{
      
              switch (bp->b_qindex) {
              case QUEUE_NONE:
                      /* FALLTHROUGH */
              case QUEUE_SENTINEL:
                      return (NULL);
              case QUEUE_EMPTY:
                return (&bqempty);
        case QUEUE_DIRTY:
                return (&bufdomain(bp)->bd_dirtyq);
        case QUEUE_CLEAN:
                return (&bufdomain(bp)->bd_subq[bp->b_subqueue]);
              default:
                      break;
              }
              panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex);
      }
      
      /*
       * Return the locked bufqueue that bp is a member of.
       */
      static struct bufqueue *
      bufqueue_acquire(struct buf *bp)
{
              struct bufqueue *bq, *nbq;
      
              /*
               * bp can be pushed from a per-cpu queue to the
               * cleanq while we're waiting on the lock.  Retry
               * if the queues don't match.
               */
              bq = bufqueue(bp);
              BQ_LOCK(bq);
              for (;;) {
                      nbq = bufqueue(bp);
                if (bq == nbq)
                              break;
                      BQ_UNLOCK(bq);
                      BQ_LOCK(nbq);
                      bq = nbq;
              }
              return (bq);
      }
      
      /*
       *        binsfree:
       *
       *        Insert the buffer into the appropriate free list.  Requires a
       *        locked buffer on entry and buffer is unlocked before return.
       */
      static void
      binsfree(struct buf *bp, int qindex)
{
              struct bufdomain *bd;
              struct bufqueue *bq;
      
              KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY,
                  ("binsfree: Invalid qindex %d", qindex));
              BUF_ASSERT_XLOCKED(bp);
      
              /*
               * Handle delayed bremfree() processing.
               */
        if (bp->b_flags & B_REMFREE) {
                if (bp->b_qindex == qindex) {
                        bp->b_flags |= B_REUSE;
                        bp->b_flags &= ~B_REMFREE;
                              BUF_UNLOCK(bp);
                              return;
                      }
                bq = bufqueue_acquire(bp);
                      bq_remove(bq, bp);
                      BQ_UNLOCK(bq);
              }
              bd = bufdomain(bp);
              if (qindex == QUEUE_CLEAN) {
                      if (bd->bd_lim != 0)
                        bq = &bd->bd_subq[PCPU_GET(cpuid)];
                      else
                              bq = bd->bd_cleanq;
              } else
                bq = &bd->bd_dirtyq;
              bq_insert(bq, bp, true);
      }
      
      /*
       * buf_free:
       *
       *        Free a buffer to the buf zone once it no longer has valid contents.
       */
      static void
      buf_free(struct buf *bp)
      {
      
        if (bp->b_flags & B_REMFREE)
                bremfreef(bp);
              if (bp->b_vflags & BV_BKGRDINPROG)
                      panic("losing buffer 1");
        if (bp->b_rcred != NOCRED) {
                      crfree(bp->b_rcred);
                      bp->b_rcred = NOCRED;
              }
        if (bp->b_wcred != NOCRED) {
                      crfree(bp->b_wcred);
                      bp->b_wcred = NOCRED;
              }
        if (!LIST_EMPTY(&bp->b_dep))
                      buf_deallocate(bp);
              bufkva_free(bp);
              atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1);
        BUF_UNLOCK(bp);
              uma_zfree(buf_zone, bp);
      }
      
      /*
       * buf_import:
       *
       *        Import bufs into the uma cache from the buf list.  The system still
       *        expects a static array of bufs and much of the synchronization
       *        around bufs assumes type stable storage.  As a result, UMA is used
       *        only as a per-cpu cache of bufs still maintained on a global list.
       */
      static int
      buf_import(void *arg, void **store, int cnt, int domain, int flags)
{
              struct buf *bp;
              int i;
      
              BQ_LOCK(&bqempty);
        for (i = 0; i < cnt; i++) {
                      bp = TAILQ_FIRST(&bqempty.bq_queue);
                      if (bp == NULL)
                              break;
                      bq_remove(&bqempty, bp);
                      store[i] = bp;
              }
              BQ_UNLOCK(&bqempty);
      
              return (i);
      }
      
      /*
       * buf_release:
       *
       *        Release bufs from the uma cache back to the buffer queues.
       */
      static void
      buf_release(void *arg, void **store, int cnt)
      {
              struct bufqueue *bq;
              struct buf *bp;
              int i;
      
              bq = &bqempty;
              BQ_LOCK(bq);
              for (i = 0; i < cnt; i++) {
                      bp = store[i];
                      /* Inline bq_insert() to batch locking. */
                      TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
                      bp->b_flags &= ~(B_AGE | B_REUSE);
                      bq->bq_len++;
                      bp->b_qindex = bq->bq_index;
              }
              BQ_UNLOCK(bq);
      }
      
      /*
       * buf_alloc:
       *
       *        Allocate an empty buffer header.
       */
      static struct buf *
      buf_alloc(struct bufdomain *bd)
      {
              struct buf *bp;
              int freebufs;
      
              /*
               * We can only run out of bufs in the buf zone if the average buf
               * is less than BKVASIZE.  In this case the actual wait/block will
         * come from buf_recycle() failing to flush one of these small bufs.
               */
              bp = NULL;
              freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1);
              if (freebufs > 0)
                      bp = uma_zalloc(buf_zone, M_NOWAIT);
              if (bp == NULL) {
                      atomic_add_int(&bd->bd_freebuffers, 1);
                      bufspace_daemon_wakeup(bd);
                      counter_u64_add(numbufallocfails, 1);
                      return (NULL);
              }
              /*
               * Wake-up the bufspace daemon on transition below threshold.
               */
        if (freebufs == bd->bd_lofreebuffers)
                      bufspace_daemon_wakeup(bd);
      
              if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
                      panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
              
              KASSERT(bp->b_vp == NULL,
                  ("bp: %p still has vnode %p.", bp, bp->b_vp));
              KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
                  ("invalid buffer %p flags %#x", bp, bp->b_flags));
              KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
                  ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
              KASSERT(bp->b_npages == 0,
                  ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
              KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
              KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
      
              bp->b_domain = BD_DOMAIN(bd);
              bp->b_flags = 0;
              bp->b_ioflags = 0;
              bp->b_xflags = 0;
              bp->b_vflags = 0;
              bp->b_vp = NULL;
              bp->b_blkno = bp->b_lblkno = 0;
              bp->b_offset = NOOFFSET;
              bp->b_iodone = 0;
              bp->b_error = 0;
              bp->b_resid = 0;
              bp->b_bcount = 0;
              bp->b_npages = 0;
              bp->b_dirtyoff = bp->b_dirtyend = 0;
              bp->b_bufobj = NULL;
              bp->b_data = bp->b_kvabase = unmapped_buf;
              bp->b_fsprivate1 = NULL;
              bp->b_fsprivate2 = NULL;
              bp->b_fsprivate3 = NULL;
              LIST_INIT(&bp->b_dep);
      
              return (bp);
      }
      
      /*
       *        buf_recycle:
       *
       *        Free a buffer from the given bufqueue.  kva controls whether the
       *        freed buf must own some kva resources.  This is used for
       *        defragmenting.
       */
      static int
      buf_recycle(struct bufdomain *bd, bool kva)
      {
              struct bufqueue *bq;
              struct buf *bp, *nbp;
      
              if (kva)
                      counter_u64_add(bufdefragcnt, 1);
              nbp = NULL;
              bq = bd->bd_cleanq;
              BQ_LOCK(bq);
              KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd),
                  ("buf_recycle: Locks don't match"));
              nbp = TAILQ_FIRST(&bq->bq_queue);
      
              /*
         * Run the scan, possibly freeing data and/or kva mappings on the
         * fly, depending on the kva argument.
               */
              while ((bp = nbp) != NULL) {
                      /*
                       * Calculate next bp (we can only use it if we do not
                       * release the bqlock).
                       */
                      nbp = TAILQ_NEXT(bp, b_freelist);
      
                      /*
                       * If we are defragging then we need a buffer with 
                       * some kva to reclaim.
                       */
                      if (kva && bp->b_kvasize == 0)
                              continue;
      
                      if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
                              continue;
      
                      /*
                 * Implement a second chance algorithm for frequently
                 * accessed buffers: a buffer queued with B_REUSE set (see
                 * binsfree()) is moved to the tail and survives one more
                 * pass of the scan before it becomes eligible for
                 * recycling.
                       */
                      if ((bp->b_flags & B_REUSE) != 0) {
                              TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
                              TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
                              bp->b_flags &= ~B_REUSE;
                              BUF_UNLOCK(bp);
                              continue;
                      }
      
                      /*
                       * Skip buffers with background writes in progress.
                       */
                      if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
                              BUF_UNLOCK(bp);
                              continue;
                      }
      
                      KASSERT(bp->b_qindex == QUEUE_CLEAN,
                          ("buf_recycle: inconsistent queue %d bp %p",
                          bp->b_qindex, bp));
                      KASSERT(bp->b_domain == BD_DOMAIN(bd),
                          ("getnewbuf: queue domain %d doesn't match request %d",
                          bp->b_domain, (int)BD_DOMAIN(bd)));
                      /*
                       * NOTE:  nbp is now entirely invalid.  We can only restart
                       * the scan from this point on.
                       */
                      bq_remove(bq, bp);
                      BQ_UNLOCK(bq);
      
                      /*
                       * Requeue the background write buffer with error and
                       * restart the scan.
                       */
                      if ((bp->b_vflags & BV_BKGRDERR) != 0) {
                              bqrelse(bp);
                              BQ_LOCK(bq);
                              nbp = TAILQ_FIRST(&bq->bq_queue);
                              continue;
                      }
                      bp->b_flags |= B_INVAL;
                      brelse(bp);
                      return (0);
              }
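        /*
         * Nothing could be recycled.  Set bd_wanted so that the next
         * bq_insert()/bd_flush() on this domain wakes up any thread
         * sleeping in bufspace_wait().
         */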
              bd->bd_wanted = 1;
              BQ_UNLOCK(bq);
      
              return (ENOBUFS);
      }
      
      /*
       *        bremfree:
       *
       *        Mark the buffer for removal from the appropriate free list.
       *        
       */
      void
      bremfree(struct buf *bp)
{
      
              CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
              KASSERT((bp->b_flags & B_REMFREE) == 0,
                  ("bremfree: buffer %p already marked for delayed removal.", bp));
              KASSERT(bp->b_qindex != QUEUE_NONE,
                  ("bremfree: buffer %p not on a queue.", bp));
        BUF_ASSERT_XLOCKED(bp);
      
              bp->b_flags |= B_REMFREE;
      }
      
      /*
       *        bremfreef:
       *
       *        Force an immediate removal from a free list.  Used only in nfs when
       *        it abuses the b_freelist pointer.
       */
      void
      bremfreef(struct buf *bp)
      {
              struct bufqueue *bq;
      
              bq = bufqueue_acquire(bp);
              bq_remove(bq, bp);
              BQ_UNLOCK(bq);
      }
      
      static void
      bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname)
      {
      
              mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF);
              TAILQ_INIT(&bq->bq_queue);
              bq->bq_len = 0;
              bq->bq_index = qindex;
              bq->bq_subqueue = subqueue;
      }
      
      static void
      bd_init(struct bufdomain *bd)
      {
              int i;
      
              bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1];
              bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock");
              bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock");
              for (i = 0; i <= mp_maxid; i++)
                      bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i,
                          "bufq clean subqueue lock");
              mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF);
      }
      
      /*
       *        bq_remove:
       *
       *        Removes a buffer from the free list, must be called with the
       *        correct qlock held.
       */
      static void
      bq_remove(struct bufqueue *bq, struct buf *bp)
{
      
              CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X",
                  bp, bp->b_vp, bp->b_flags);
              KASSERT(bp->b_qindex != QUEUE_NONE,
                  ("bq_remove: buffer %p not on a queue.", bp));
        KASSERT(bufqueue(bp) == bq,
                  ("bq_remove: Remove buffer %p from wrong queue.", bp));
      
              BQ_ASSERT_LOCKED(bq);
        if (bp->b_qindex != QUEUE_EMPTY) {
                BUF_ASSERT_XLOCKED(bp);
              }
              KASSERT(bq->bq_len >= 1,
                  ("queue %d underflow", bp->b_qindex));
        TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
              bq->bq_len--;
              bp->b_qindex = QUEUE_NONE;
              bp->b_flags &= ~(B_REMFREE | B_REUSE);
      }
      
      static void
      bd_flush(struct bufdomain *bd, struct bufqueue *bq)
      {
              struct buf *bp;
      
              BQ_ASSERT_LOCKED(bq);
              if (bq != bd->bd_cleanq) {
                      BD_LOCK(bd);
                      while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) {
                              TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
                              TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp,
                                  b_freelist);
                              bp->b_subqueue = bd->bd_cleanq->bq_subqueue;
                      }
                      bd->bd_cleanq->bq_len += bq->bq_len;
                      bq->bq_len = 0;
              }
              if (bd->bd_wanted) {
                      bd->bd_wanted = 0;
                      wakeup(&bd->bd_wanted);
              }
              if (bq != bd->bd_cleanq)
                      BD_UNLOCK(bd);
      }
      
      static int
      bd_flushall(struct bufdomain *bd)
      {
              struct bufqueue *bq;
              int flushed;
              int i;
      
              if (bd->bd_lim == 0)
                      return (0);
              flushed = 0;
              for (i = 0; i <= mp_maxid; i++) {
                      bq = &bd->bd_subq[i];
                      if (bq->bq_len == 0)
                              continue;
                      BQ_LOCK(bq);
                      bd_flush(bd, bq);
                      BQ_UNLOCK(bq);
                      flushed++;
              }
      
              return (flushed);
      }
      
      static void
      bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock)
{
              struct bufdomain *bd;
      
              if (bp->b_qindex != QUEUE_NONE)
                      panic("bq_insert: free buffer %p onto another queue?", bp);
      
              bd = bufdomain(bp);
              if (bp->b_flags & B_AGE) {
                      /* Place this buf directly on the real queue. */
                      if (bq->bq_index == QUEUE_CLEAN)
                              bq = bd->bd_cleanq;
                      BQ_LOCK(bq);
                      TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist);
              } else {
                      BQ_LOCK(bq);
                TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
              }
              bp->b_flags &= ~(B_AGE | B_REUSE);
              bq->bq_len++;
              bp->b_qindex = bq->bq_index;
              bp->b_subqueue = bq->bq_subqueue;
      
              /*
               * Unlock before we notify so that we don't wakeup a waiter that
               * fails a trylock on the buf and sleeps again.
               */
              if (unlock)
                       BUF_UNLOCK(bp);
      
               if (bp->b_qindex == QUEUE_CLEAN) {
                      /*
                       * Flush the per-cpu queue and notify any waiters.
                       */
                       if (bd->bd_wanted || (bq != bd->bd_cleanq &&
                          bq->bq_len >= bd->bd_lim))
                              bd_flush(bd, bq);
              }
              BQ_UNLOCK(bq);
      }
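
       /*
        * Illustrative caller sketch (comment only): a release path might
        * queue a clean buffer on the calling CPU's subqueue for its
        * domain and let bq_insert() drop the buf lock.  The per-CPU
        * selection shown here is an assumption about the caller, not a
        * quote of one; PCPU_GET(cpuid) is the stock way to name the
        * current CPU, and bufdomain() and bd_subq are as used above.
        *
        *        bd = bufdomain(bp);
        *        bq = &bd->bd_subq[PCPU_GET(cpuid)];
        *        bq_insert(bq, bp, true);
        */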
      
      /*
       *        bufkva_free:
       *
       *        Free the kva allocation for a buffer.
       *
       */
      static void
      bufkva_free(struct buf *bp)
       {
      
      #ifdef INVARIANTS
              if (bp->b_kvasize == 0) {
                       KASSERT(bp->b_kvabase == unmapped_buf &&
                          bp->b_data == unmapped_buf,
                          ("Leaked KVA space on %p", bp));
              } else if (buf_mapped(bp))
                      BUF_CHECK_MAPPED(bp);
              else
                      BUF_CHECK_UNMAPPED(bp);
      #endif
              if (bp->b_kvasize == 0)
                      return;
      
               vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
              counter_u64_add(bufkvaspace, -bp->b_kvasize);
              counter_u64_add(buffreekvacnt, 1);
              bp->b_data = bp->b_kvabase = unmapped_buf;
              bp->b_kvasize = 0;
      }
      
      /*
       *        bufkva_alloc:
       *
       *        Allocate the buffer KVA and set b_kvasize and b_kvabase.
       */
      static int
      bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
       {
              vm_offset_t addr;
              int error;
      
              KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
                  ("Invalid gbflags 0x%x in %s", gbflags, __func__));
      
              bufkva_free(bp);
      
              addr = 0;
              error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
              if (error != 0) {
                      /*
                       * Buffer map is too fragmented.  Request the caller
                       * to defragment the map.
                       */
                      return (error);
              }
              bp->b_kvabase = (caddr_t)addr;
              bp->b_kvasize = maxsize;
              counter_u64_add(bufkvaspace, bp->b_kvasize);
              if ((gbflags & GB_UNMAPPED) != 0) {
                      bp->b_data = unmapped_buf;
                      BUF_CHECK_UNMAPPED(bp);
              } else {
                      bp->b_data = bp->b_kvabase;
                      BUF_CHECK_MAPPED(bp);
              }
              return (0);
      }
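
       /*
        * Retry sketch (illustrative only; the callers in this file may
        * differ): on fragmentation the caller can recycle buffers in the
        * domain, which returns their KVA to buffer_arena, and try the
        * allocation again.  The stop-on-nonzero policy mirrors the loop
        * in bufkva_reclaim() below and is an assumption about
        * buf_recycle()'s return convention.
        *
        *        while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
        *                if (buf_recycle(bufdomain(bp), true) != 0)
        *                        break;
        *        }
        */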
      
      /*
       *        bufkva_reclaim:
       *
       *        Reclaim buffer kva by freeing buffers holding kva.  This is a vmem
       *        callback that fires to avoid returning failure.
       */
      static void
      bufkva_reclaim(vmem_t *vmem, int flags)
      {
              bool done;
              int q;
              int i;
      
              done = false;
              for (i = 0; i < 5; i++) {
                      for (q = 0; q < buf_domains; q++)
                              if (buf_recycle(&bdomain[q], true) != 0)
                                      done = true;
                      if (done)
                              break;
              }
              return;
      }
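
       /*
        * Registration sketch (an assumption, not a quote of this file's
        * initialization path): buffer_arena presumably installs this
        * routine as its reclaim callback, e.g. via vmem_set_reclaim(),
        * so a vmem_alloc() failure in bufkva_alloc() can trigger
        * recycling before the allocator gives up.
        *
        *        vmem_set_reclaim(buffer_arena, bufkva_reclaim);
        */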
      
      /*
       * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
        * clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is set,
       * the buffer is valid and we do not have to do anything.
       */
      static void
      breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt,
          struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *))
       {
              struct buf *rabp;
              struct thread *td;
              int i;
      
              td = curthread;
      
               for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
                      if (inmem(vp, *rablkno))
                              continue;
                      rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
                      if ((rabp->b_flags & B_CACHE) != 0) {
                              brelse(rabp);
                              continue;
                      }
      #ifdef RACCT
                      if (racct_enable) {
                              PROC_LOCK(curproc);
                              racct_add_buf(curproc, rabp, 0);
                              PROC