/*        $NetBSD: uvm_map.c,v 1.366 2019/11/01 13:04:22 rin Exp $        */
      
      /*
       * Copyright (c) 1997 Charles D. Cranor and Washington University.
       * Copyright (c) 1991, 1993, The Regents of the University of California.
       *
       * All rights reserved.
       *
       * This code is derived from software contributed to Berkeley by
       * The Mach Operating System project at Carnegie-Mellon University.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)vm_map.c    8.3 (Berkeley) 1/12/94
       * from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
       *
       *
       * Copyright (c) 1987, 1990 Carnegie-Mellon University.
       * All rights reserved.
       *
       * Permission to use, copy, modify and distribute this software and
       * its documentation is hereby granted, provided that both the copyright
       * notice and this permission notice appear in all copies of the
       * software, derivative works or modified versions, and any portions
       * thereof, and that both notices appear in supporting documentation.
       *
       * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
       * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
       * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
       *
       * Carnegie Mellon requests users of this software to return to
       *
       *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
       *  School of Computer Science
       *  Carnegie Mellon University
       *  Pittsburgh PA 15213-3890
       *
       * any improvements or extensions that they make and grant Carnegie the
       * rights to redistribute these changes.
       */
      
      /*
       * uvm_map.c: uvm map operations
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v 1.366 2019/11/01 13:04:22 rin Exp $");
      
      #include "opt_ddb.h"
      #include "opt_pax.h"
      #include "opt_uvmhist.h"
      #include "opt_uvm.h"
      #include "opt_sysv.h"
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/mman.h>
      #include <sys/proc.h>
      #include <sys/pool.h>
      #include <sys/kernel.h>
      #include <sys/mount.h>
      #include <sys/pax.h>
      #include <sys/vnode.h>
      #include <sys/filedesc.h>
      #include <sys/lockdebug.h>
      #include <sys/atomic.h>
      #include <sys/sysctl.h>
      #ifndef __USER_VA0_IS_SAFE
      #include <sys/kauth.h>
      #include "opt_user_va0_disable_default.h"
      #endif
      
      #include <sys/shm.h>
      
      #include <uvm/uvm.h>
      #include <uvm/uvm_readahead.h>
      
      #if defined(DDB) || defined(DEBUGPRINT)
      #include <uvm/uvm_ddb.h>
      #endif
      
      #ifdef UVMHIST
      #ifndef UVMHIST_MAPHIST_SIZE
      #define UVMHIST_MAPHIST_SIZE 100
      #endif
      #ifndef UVMHIST_PDHIST_SIZE
      #define UVMHIST_PDHIST_SIZE 100
      #endif
      static struct kern_history_ent maphistbuf[UVMHIST_MAPHIST_SIZE];
      UVMHIST_DEFINE(maphist) = UVMHIST_INITIALIZER(maphist, maphistbuf);
      #endif
      
      #if !defined(UVMMAP_COUNTERS)
      
      #define        UVMMAP_EVCNT_DEFINE(name)        /* nothing */
      #define UVMMAP_EVCNT_INCR(ev)                /* nothing */
      #define UVMMAP_EVCNT_DECR(ev)                /* nothing */
      
#else /* defined(UVMMAP_COUNTERS) */
      
      #include <sys/evcnt.h>
      #define        UVMMAP_EVCNT_DEFINE(name) \
      struct evcnt uvmmap_evcnt_##name = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, \
          "uvmmap", #name); \
      EVCNT_ATTACH_STATIC(uvmmap_evcnt_##name);
      #define        UVMMAP_EVCNT_INCR(ev)                uvmmap_evcnt_##ev.ev_count++
      #define        UVMMAP_EVCNT_DECR(ev)                uvmmap_evcnt_##ev.ev_count--
      
#endif /* !defined(UVMMAP_COUNTERS) */
      
      UVMMAP_EVCNT_DEFINE(ubackmerge)
      UVMMAP_EVCNT_DEFINE(uforwmerge)
      UVMMAP_EVCNT_DEFINE(ubimerge)
      UVMMAP_EVCNT_DEFINE(unomerge)
      UVMMAP_EVCNT_DEFINE(kbackmerge)
      UVMMAP_EVCNT_DEFINE(kforwmerge)
      UVMMAP_EVCNT_DEFINE(kbimerge)
      UVMMAP_EVCNT_DEFINE(knomerge)
      UVMMAP_EVCNT_DEFINE(map_call)
      UVMMAP_EVCNT_DEFINE(mlk_call)
      UVMMAP_EVCNT_DEFINE(mlk_hint)
      UVMMAP_EVCNT_DEFINE(mlk_list)
      UVMMAP_EVCNT_DEFINE(mlk_tree)
      UVMMAP_EVCNT_DEFINE(mlk_treeloop)
      UVMMAP_EVCNT_DEFINE(mlk_listloop)
      
      const char vmmapbsy[] = "vmmapbsy";
      
      /*
       * cache for vmspace structures.
       */
      
      static struct pool_cache uvm_vmspace_cache;
      
      /*
       * cache for dynamically-allocated map entries.
       */
      
      static struct pool_cache uvm_map_entry_cache;
      
      #ifdef PMAP_GROWKERNEL
      /*
       * This global represents the end of the kernel virtual address
       * space.  If we want to exceed this, we must grow the kernel
       * virtual address space dynamically.
       *
       * Note, this variable is locked by kernel_map's lock.
       */
      vaddr_t uvm_maxkaddr;
      #endif
      
      #ifndef __USER_VA0_IS_SAFE
      #ifndef __USER_VA0_DISABLE_DEFAULT
      #define __USER_VA0_DISABLE_DEFAULT 1
      #endif
      #ifdef USER_VA0_DISABLE_DEFAULT /* kernel config option overrides */
      #undef __USER_VA0_DISABLE_DEFAULT
      #define __USER_VA0_DISABLE_DEFAULT USER_VA0_DISABLE_DEFAULT
      #endif
      int user_va0_disable = __USER_VA0_DISABLE_DEFAULT;
      #endif
      
      /*
       * macros
       */
      
      /*
       * uvm_map_align_va: round down or up virtual address
       */
      static __inline void
      uvm_map_align_va(vaddr_t *vap, vsize_t align, int topdown)
      {
      
        KASSERT(powerof2(align));

        if (align != 0 && (*vap & (align - 1)) != 0) {
                if (topdown)
                        *vap = rounddown2(*vap, align);
                else
                        *vap = roundup2(*vap, align);
              }
      }
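
/*
 * Illustrative example (not part of the original source): with align set to
 * 0x2000 and *vap == 0x12345000, a topdown map rounds the address down to
 * 0x12344000, while a bottom-up map rounds it up to 0x12346000.
 */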
      
      /*
       * UVM_ET_ISCOMPATIBLE: check some requirements for map entry merging
       */
      extern struct vm_map *pager_map;
      
      #define        UVM_ET_ISCOMPATIBLE(ent, type, uobj, meflags, \
          prot, maxprot, inh, adv, wire) \
              ((ent)->etype == (type) && \
              (((ent)->flags ^ (meflags)) & (UVM_MAP_NOMERGE)) == 0 && \
              (ent)->object.uvm_obj == (uobj) && \
              (ent)->protection == (prot) && \
              (ent)->max_protection == (maxprot) && \
              (ent)->inheritance == (inh) && \
              (ent)->advice == (adv) && \
              (ent)->wired_count == (wire))
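
/*
 * Illustrative note (not part of the original source): two adjacent anonymous,
 * unwired entries with identical protection, max protection, inheritance and
 * advice are compatible and may be merged by uvm_map_enter(); a wired entry
 * (wired_count != 0) never merges with a new, unwired mapping.
 */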
      
      /*
       * uvm_map_entry_link: insert entry into a map
       *
       * => map must be locked
       */
      #define uvm_map_entry_link(map, after_where, entry) do { \
              uvm_mapent_check(entry); \
              (map)->nentries++; \
              (entry)->prev = (after_where); \
              (entry)->next = (after_where)->next; \
              (entry)->prev->next = (entry); \
              (entry)->next->prev = (entry); \
              uvm_rb_insert((map), (entry)); \
      } while (/*CONSTCOND*/ 0)
      
      /*
       * uvm_map_entry_unlink: remove entry from a map
       *
       * => map must be locked
       */
      #define uvm_map_entry_unlink(map, entry) do { \
              KASSERT((entry) != (map)->first_free); \
              KASSERT((entry) != (map)->hint); \
              uvm_mapent_check(entry); \
              (map)->nentries--; \
              (entry)->next->prev = (entry)->prev; \
              (entry)->prev->next = (entry)->next; \
              uvm_rb_remove((map), (entry)); \
      } while (/*CONSTCOND*/ 0)
      
      /*
       * SAVE_HINT: saves the specified entry as the hint for future lookups.
       *
       * => map need not be locked.
       */
      #define SAVE_HINT(map, check, value) do { \
              if ((map)->hint == (check)) \
                      (map)->hint = (value); \
      } while (/*CONSTCOND*/ 0)
      
      /*
       * clear_hints: ensure that hints don't point to the entry.
       *
       * => map must be write-locked.
       */
      static void
      clear_hints(struct vm_map *map, struct vm_map_entry *ent)
      {
      
              SAVE_HINT(map, ent, ent->prev);
        if (map->first_free == ent) {
                      map->first_free = ent->prev;
              }
      }
      
      /*
       * VM_MAP_RANGE_CHECK: check and correct range
       *
       * => map must at least be read locked
       */
      
      #define VM_MAP_RANGE_CHECK(map, start, end) do { \
              if (start < vm_map_min(map))                \
                      start = vm_map_min(map);        \
              if (end > vm_map_max(map))                \
                      end = vm_map_max(map);                \
              if (start > end)                        \
                      start = end;                        \
      } while (/*CONSTCOND*/ 0)
      
      /*
       * local prototypes
       */
      
      static struct vm_map_entry *
                      uvm_mapent_alloc(struct vm_map *, int);
      static void        uvm_mapent_copy(struct vm_map_entry *, struct vm_map_entry *);
      static void        uvm_mapent_free(struct vm_map_entry *);
      #if defined(DEBUG)
      static void        _uvm_mapent_check(const struct vm_map_entry *, const char *,
                          int);
      #define        uvm_mapent_check(map)        _uvm_mapent_check(map, __FILE__, __LINE__)
      #else /* defined(DEBUG) */
      #define        uvm_mapent_check(e)        /* nothing */
      #endif /* defined(DEBUG) */
      
      static void        uvm_map_entry_unwire(struct vm_map *, struct vm_map_entry *);
      static void        uvm_map_reference_amap(struct vm_map_entry *, int);
      static int        uvm_map_space_avail(vaddr_t *, vsize_t, voff_t, vsize_t, int,
                          int, struct vm_map_entry *);
      static void        uvm_map_unreference_amap(struct vm_map_entry *, int);
      
      int _uvm_map_sanity(struct vm_map *);
      int _uvm_tree_sanity(struct vm_map *);
      static vsize_t uvm_rb_maxgap(const struct vm_map_entry *);
      
      #define        ROOT_ENTRY(map)                ((struct vm_map_entry *)(map)->rb_tree.rbt_root)
      #define        LEFT_ENTRY(entry)        ((struct vm_map_entry *)(entry)->rb_node.rb_left)
      #define        RIGHT_ENTRY(entry)        ((struct vm_map_entry *)(entry)->rb_node.rb_right)
      #define        PARENT_ENTRY(map, entry) \
              (ROOT_ENTRY(map) == (entry) \
                  ? NULL : (struct vm_map_entry *)RB_FATHER(&(entry)->rb_node))
      
      /*
       * These get filled in if/when SYSVSHM shared memory code is loaded
       *
 * We do this with function pointers rather than #ifdef SYSVSHM so the
       * SYSVSHM code can be loaded and unloaded
       */
      void (*uvm_shmexit)(struct vmspace *) = NULL;
      void (*uvm_shmfork)(struct vmspace *, struct vmspace *) = NULL;
      
      static int
      uvm_map_compare_nodes(void *ctx, const void *nparent, const void *nkey)
      {
              const struct vm_map_entry *eparent = nparent;
              const struct vm_map_entry *ekey = nkey;
      
        KASSERT(eparent->start < ekey->start || eparent->start >= ekey->end);
        KASSERT(ekey->start < eparent->start || ekey->start >= eparent->end);

        if (eparent->start < ekey->start)
                return -1;
        if (eparent->end >= ekey->start)
                return 1;
        return 0;
}
      
      static int
      uvm_map_compare_key(void *ctx, const void *nparent, const void *vkey)
      {
              const struct vm_map_entry *eparent = nparent;
              const vaddr_t va = *(const vaddr_t *) vkey;
      
              if (eparent->start < va)
                      return -1;
              if (eparent->end >= va)
                      return 1;
              return 0;
      }
      
      static const rb_tree_ops_t uvm_map_tree_ops = {
              .rbto_compare_nodes = uvm_map_compare_nodes,
              .rbto_compare_key = uvm_map_compare_key,
              .rbto_node_offset = offsetof(struct vm_map_entry, rb_node),
              .rbto_context = NULL
      };
      
      /*
       * uvm_rb_gap: return the gap size between our entry and next entry.
       */
      static inline vsize_t
      uvm_rb_gap(const struct vm_map_entry *entry)
      {
      
        KASSERT(entry->next != NULL);
        return entry->next->start - entry->end;
      }
      
      static vsize_t
      uvm_rb_maxgap(const struct vm_map_entry *entry)
      {
              struct vm_map_entry *child;
        vsize_t maxgap = entry->gap;

        /*
         * We need maxgap to be the largest gap of us or any of our
         * descendants.  Since each of our children's maxgap is the
         * cached value of their largest gap of themselves or their
         * descendants, we can just use that value and avoid recursing
         * down the tree to calculate it.
         */
        if ((child = LEFT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
                maxgap = child->maxgap;

        if ((child = RIGHT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
                maxgap = child->maxgap;
      
              return maxgap;
      }
      
      static void
      uvm_rb_fixup(struct vm_map *map, struct vm_map_entry *entry)
      {
              struct vm_map_entry *parent;
      
        KASSERT(entry->gap == uvm_rb_gap(entry));
        entry->maxgap = uvm_rb_maxgap(entry);

        while ((parent = PARENT_ENTRY(map, entry)) != NULL) {
                struct vm_map_entry *brother;
                vsize_t maxgap = parent->gap;
                unsigned int which;

                KDASSERT(parent->gap == uvm_rb_gap(parent));
                if (maxgap < entry->maxgap)
                        maxgap = entry->maxgap;
                /*
                 * Since we work towards the root, we know entry's maxgap
                 * value is OK, but its brothers may now be out-of-date due
                 * to rebalancing.  So refresh it.
                 */
                which = RB_POSITION(&entry->rb_node) ^ RB_DIR_OTHER;
                brother = (struct vm_map_entry *)parent->rb_node.rb_nodes[which];
                if (brother != NULL) {
                        KDASSERT(brother->gap == uvm_rb_gap(brother));
                        brother->maxgap = uvm_rb_maxgap(brother);
                        if (maxgap < brother->maxgap)
                                maxgap = brother->maxgap;
                }

                parent->maxgap = maxgap;
                entry = parent;
              }
      }
      
      static void
      uvm_rb_insert(struct vm_map *map, struct vm_map_entry *entry)
      {
              struct vm_map_entry *ret __diagused;
      
        entry->gap = entry->maxgap = uvm_rb_gap(entry);
        if (entry->prev != &map->header)
                entry->prev->gap = uvm_rb_gap(entry->prev);

        ret = rb_tree_insert_node(&map->rb_tree, entry);
        KASSERTMSG(ret == entry,
            "uvm_rb_insert: map %p: duplicate entry %p", map, ret);

        /*
         * If the previous entry is not our immediate left child, then it's an
         * ancestor and will be fixed up on the way to the root.  We don't
         * have to check entry->prev against &map->header since &map->header
         * will never be in the tree.
         */
        uvm_rb_fixup(map,
            LEFT_ENTRY(entry) == entry->prev ? entry->prev : entry);
      }
      
      static void
      uvm_rb_remove(struct vm_map *map, struct vm_map_entry *entry)
      {
              struct vm_map_entry *prev_parent = NULL, *next_parent = NULL;
      
              /*
               * If we are removing an interior node, then an adjacent node will
               * be used to replace its position in the tree.  Therefore we will
               * need to fixup the tree starting at the parent of the replacement
               * node.  So record their parents for later use.
               */
        if (entry->prev != &map->header)
                prev_parent = PARENT_ENTRY(map, entry->prev);
        if (entry->next != &map->header)
                next_parent = PARENT_ENTRY(map, entry->next);

        rb_tree_remove_node(&map->rb_tree, entry);

        /*
         * If the previous node has a new parent, fixup the tree starting
         * at the previous node's old parent.
         */
        if (entry->prev != &map->header) {
                /*
                 * Update the previous entry's gap due to our absence.
                 */
                entry->prev->gap = uvm_rb_gap(entry->prev);
                uvm_rb_fixup(map, entry->prev);
                if (prev_parent != NULL
                    && prev_parent != entry
                    && prev_parent != PARENT_ENTRY(map, entry->prev))
                        uvm_rb_fixup(map, prev_parent);
        }

        /*
         * If the next node has a new parent, fixup the tree starting
         * at the next node's old parent.
         */
        if (entry->next != &map->header) {
                uvm_rb_fixup(map, entry->next);
                if (next_parent != NULL
                    && next_parent != entry
                    && next_parent != PARENT_ENTRY(map, entry->next))
                        uvm_rb_fixup(map, next_parent);
        }
              }
      }
      
      #if defined(DEBUG)
      int uvm_debug_check_map = 0;
      int uvm_debug_check_rbtree = 0;
      #define uvm_map_check(map, name) \
              _uvm_map_check((map), (name), __FILE__, __LINE__)
      static void
      _uvm_map_check(struct vm_map *map, const char *name,
          const char *file, int line)
      {
      
        if ((uvm_debug_check_map && _uvm_map_sanity(map)) ||
            (uvm_debug_check_rbtree && _uvm_tree_sanity(map))) {
                      panic("uvm_map_check failed: \"%s\" map=%p (%s:%d)",
                          name, map, file, line);
              }
      }
      #else /* defined(DEBUG) */
      #define uvm_map_check(map, name)        /* nothing */
      #endif /* defined(DEBUG) */
      
      #if defined(DEBUG) || defined(DDB)
      int
      _uvm_map_sanity(struct vm_map *map)
      {
              bool first_free_found = false;
              bool hint_found = false;
              const struct vm_map_entry *e;
              struct vm_map_entry *hint = map->hint;
      
              e = &map->header;
              for (;;) {
                      if (map->first_free == e) {
                              first_free_found = true;
                      } else if (!first_free_found && e->next->start > e->end) {
                              printf("first_free %p should be %p\n",
                                  map->first_free, e);
                              return -1;
                      }
                      if (hint == e) {
                              hint_found = true;
                      }
      
                      e = e->next;
                      if (e == &map->header) {
                              break;
                      }
              }
              if (!first_free_found) {
                      printf("stale first_free\n");
                      return -1;
              }
              if (!hint_found) {
                      printf("stale hint\n");
                      return -1;
              }
              return 0;
      }
      
      int
      _uvm_tree_sanity(struct vm_map *map)
      {
              struct vm_map_entry *tmp, *trtmp;
              int n = 0, i = 1;
      
              for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
                      if (tmp->gap != uvm_rb_gap(tmp)) {
                              printf("%d/%d gap %#lx != %#lx %s\n",
                                  n + 1, map->nentries,
                                  (ulong)tmp->gap, (ulong)uvm_rb_gap(tmp),
                                  tmp->next == &map->header ? "(last)" : "");
                              goto error;
                      }
                      /*
                       * If any entries are out of order, tmp->gap will be unsigned
                       * and will likely exceed the size of the map.
                       */
                      if (tmp->gap >= vm_map_max(map) - vm_map_min(map)) {
                              printf("too large gap %zu\n", (size_t)tmp->gap);
                              goto error;
                      }
                      n++;
              }
      
              if (n != map->nentries) {
                      printf("nentries: %d vs %d\n", n, map->nentries);
                      goto error;
              }
      
              trtmp = NULL;
              for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
                      if (tmp->maxgap != uvm_rb_maxgap(tmp)) {
                              printf("maxgap %#lx != %#lx\n",
                                  (ulong)tmp->maxgap,
                                  (ulong)uvm_rb_maxgap(tmp));
                              goto error;
                      }
                      if (trtmp != NULL && trtmp->start >= tmp->start) {
                              printf("corrupt: 0x%"PRIxVADDR"x >= 0x%"PRIxVADDR"x\n",
                                  trtmp->start, tmp->start);
                              goto error;
                      }
      
                      trtmp = tmp;
              }
      
              for (tmp = map->header.next; tmp != &map->header;
                  tmp = tmp->next, i++) {
                      trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_LEFT);
                      if (trtmp == NULL)
                              trtmp = &map->header;
                      if (tmp->prev != trtmp) {
                              printf("lookup: %d: %p->prev=%p: %p\n",
                                  i, tmp, tmp->prev, trtmp);
                              goto error;
                      }
                      trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_RIGHT);
                      if (trtmp == NULL)
                              trtmp = &map->header;
                      if (tmp->next != trtmp) {
                              printf("lookup: %d: %p->next=%p: %p\n",
                                  i, tmp, tmp->next, trtmp);
                              goto error;
                      }
                      trtmp = rb_tree_find_node(&map->rb_tree, &tmp->start);
                      if (trtmp != tmp) {
                              printf("lookup: %d: %p - %p: %p\n", i, tmp, trtmp,
                                  PARENT_ENTRY(map, tmp));
                              goto error;
                      }
              }
      
              return (0);
       error:
              return (-1);
      }
      #endif /* defined(DEBUG) || defined(DDB) */
      
      /*
       * vm_map_lock: acquire an exclusive (write) lock on a map.
       *
       * => The locking protocol provides for guaranteed upgrade from shared ->
       *    exclusive by whichever thread currently has the map marked busy.
       *    See "LOCKING PROTOCOL NOTES" in uvm_map.h.  This is horrible; among
       *    other problems, it defeats any fairness guarantees provided by RW
       *    locks.
       */
      
      void
      vm_map_lock(struct vm_map *map)
{

        for (;;) {
                rw_enter(&map->lock, RW_WRITER);
                if (map->busy == NULL || map->busy == curlwp) {
                        break;
                }
                mutex_enter(&map->misc_lock);
                rw_exit(&map->lock);
                if (map->busy != NULL) {
                        cv_wait(&map->cv, &map->misc_lock);
                }
                mutex_exit(&map->misc_lock);
        }
        map->timestamp++;
      }
      
      /*
       * vm_map_lock_try: try to lock a map, failing if it is already locked.
       */
      
      bool
      vm_map_lock_try(struct vm_map *map)
      {
      
        if (!rw_tryenter(&map->lock, RW_WRITER)) {
                return false;
        }
        if (map->busy != NULL) {
                rw_exit(&map->lock);
                return false;
        }
        map->timestamp++;
        return true;
      }
      
      /*
       * vm_map_unlock: release an exclusive lock on a map.
       */
      
      void
      vm_map_unlock(struct vm_map *map)
      {
      
        KASSERT(rw_write_held(&map->lock));
        KASSERT(map->busy == NULL || map->busy == curlwp);
        rw_exit(&map->lock);
      }
      
      /*
       * vm_map_unbusy: mark the map as unbusy, and wake any waiters that
       *     want an exclusive lock.
       */
      
      void
      vm_map_unbusy(struct vm_map *map)
      {
      
        KASSERT(map->busy == curlwp);
      
              /*
               * Safe to clear 'busy' and 'waiters' with only a read lock held:
               *
               * o they can only be set with a write lock held
               * o writers are blocked out with a read or write hold
               * o at any time, only one thread owns the set of values
               */
        mutex_enter(&map->misc_lock);
              map->busy = NULL;
              cv_broadcast(&map->cv);
              mutex_exit(&map->misc_lock);
      }
      
      /*
       * vm_map_lock_read: acquire a shared (read) lock on a map.
       */
      
      void
      vm_map_lock_read(struct vm_map *map)
      {
      
        rw_enter(&map->lock, RW_READER);
      }
      
      /*
       * vm_map_unlock_read: release a shared lock on a map.
       */
      
      void
      vm_map_unlock_read(struct vm_map *map)
      {
      
        rw_exit(&map->lock);
      }
      
      /*
       * vm_map_busy: mark a map as busy.
       *
       * => the caller must hold the map write locked
       */
      
      void
      vm_map_busy(struct vm_map *map)
      {
      
        KASSERT(rw_write_held(&map->lock));
        KASSERT(map->busy == NULL);

        map->busy = curlwp;
      }
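
/*
 * Illustrative sketch (not part of the original source) of the busy protocol
 * described above vm_map_lock().  A thread that must drop the map lock while
 * keeping other updaters out typically does something like:
 *
 *	vm_map_lock(map);
 *	vm_map_busy(map);
 *	vm_map_unlock(map);
 *	...work that may sleep...
 *	vm_map_lock(map);
 *	vm_map_unbusy(map);
 *	vm_map_unlock(map);
 */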
      
      /*
       * vm_map_locked_p: return true if the map is write locked.
       *
       * => only for debug purposes like KASSERTs.
       * => should not be used to verify that a map is not locked.
       */
      
      bool
      vm_map_locked_p(struct vm_map *map)
      {
      
        return rw_write_held(&map->lock);
      }
      
      /*
       * uvm_mapent_alloc: allocate a map entry
       */
      
      static struct vm_map_entry *
      uvm_mapent_alloc(struct vm_map *map, int flags)
      {
              struct vm_map_entry *me;
        int pflags = (flags & UVM_FLAG_NOWAIT) ? PR_NOWAIT : PR_WAITOK;
        UVMHIST_FUNC("uvm_mapent_alloc"); UVMHIST_CALLED(maphist);

        me = pool_cache_get(&uvm_map_entry_cache, pflags);
        if (__predict_false(me == NULL)) {
                return NULL;
        }
        me->flags = 0;

        UVMHIST_LOG(maphist, "<- new entry=%#jx [kentry=%jd]", (uintptr_t)me,
            (map == kernel_map), 0, 0);
        return me;
      }
      
      /*
       * uvm_mapent_free: free map entry
       */
      
      static void
      uvm_mapent_free(struct vm_map_entry *me)
      {
              UVMHIST_FUNC("uvm_mapent_free"); UVMHIST_CALLED(maphist);
      
              UVMHIST_LOG(maphist,"<- freeing map entry=%#jx [flags=%jd]",
                      (uintptr_t)me, me->flags, 0, 0);
        pool_cache_put(&uvm_map_entry_cache, me);
      }
      
      /*
       * uvm_mapent_copy: copy a map entry, preserving flags
       */
      
      static inline void
      uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
      {
      
        memcpy(dst, src, ((char *)&src->uvm_map_entry_stop_copy) -
                  ((char *)src));
      }
      
      #if defined(DEBUG)
      static void
      _uvm_mapent_check(const struct vm_map_entry *entry, const char *file, int line)
      {
      
        if (entry->start >= entry->end) {
                goto bad;
        }
        if (UVM_ET_ISOBJ(entry)) {
                if (entry->object.uvm_obj == NULL) {
                        goto bad;
                }
        } else if (UVM_ET_ISSUBMAP(entry)) {
                if (entry->object.sub_map == NULL) {
                        goto bad;
                }
        } else {
                if (entry->object.uvm_obj != NULL ||
                    entry->object.sub_map != NULL) {
                        goto bad;
                }
        }
        if (!UVM_ET_ISOBJ(entry)) {
                if (entry->offset != 0) {
                              goto bad;
                      }
              }
      
              return;
      
      bad:
              panic("%s: bad entry %p (%s:%d)", __func__, entry, file, line);
      }
      #endif /* defined(DEBUG) */
      
      /*
       * uvm_map_entry_unwire: unwire a map entry
       *
       * => map should be locked by caller
       */
      
      static inline void
      uvm_map_entry_unwire(struct vm_map *map, struct vm_map_entry *entry)
      {
      
        entry->wired_count = 0;
              uvm_fault_unwire_locked(map, entry->start, entry->end);
      }
      
      
      /*
       * wrapper for calling amap_ref()
       */
      static inline void
      uvm_map_reference_amap(struct vm_map_entry *entry, int flags)
      {
      
              amap_ref(entry->aref.ar_amap, entry->aref.ar_pageoff,
            (entry->end - entry->start) >> PAGE_SHIFT, flags);
      }
      
      
      /*
       * wrapper for calling amap_unref()
       */
      static inline void
      uvm_map_unreference_amap(struct vm_map_entry *entry, int flags)
      {
      
        amap_unref(entry->aref.ar_amap, entry->aref.ar_pageoff,
                  (entry->end - entry->start) >> PAGE_SHIFT, flags);
      }
      
      
      /*
       * uvm_map_init: init mapping system at boot time.
       */
      
      void
      uvm_map_init(void)
      {
      #if defined(UVMHIST)
              static struct kern_history_ent pdhistbuf[UVMHIST_PDHIST_SIZE];
      #endif
      
              /*
               * first, init logging system.
               */
      
              UVMHIST_FUNC("uvm_map_init");
              UVMHIST_LINK_STATIC(maphist);
              UVMHIST_INIT_STATIC(pdhist, pdhistbuf);
              UVMHIST_CALLED(maphist);
              UVMHIST_LOG(maphist,"<starting uvm map system>", 0, 0, 0, 0);
      
              /*
               * initialize the global lock for kernel map entry.
               */
      
              mutex_init(&uvm_kentry_lock, MUTEX_DRIVER, IPL_VM);
      }
      
      /*
       * uvm_map_init_caches: init mapping system caches.
       */
      void
      uvm_map_init_caches(void)
      {
              /*
               * initialize caches.
               */
      
              pool_cache_bootstrap(&uvm_map_entry_cache, sizeof(struct vm_map_entry),
                  0, 0, 0, "vmmpepl", NULL, IPL_NONE, NULL, NULL, NULL);
              pool_cache_bootstrap(&uvm_vmspace_cache, sizeof(struct vmspace),
                  0, 0, 0, "vmsppl", NULL, IPL_NONE, NULL, NULL, NULL);
      }
      
      /*
       * clippers
       */
      
      /*
       * uvm_mapent_splitadj: adjust map entries for splitting, after uvm_mapent_copy.
       */
      
      static void
      uvm_mapent_splitadj(struct vm_map_entry *entry1, struct vm_map_entry *entry2,
          vaddr_t splitat)
      {
              vaddr_t adj;
      
        KASSERT(entry1->start < splitat);
        KASSERT(splitat < entry1->end);

        adj = splitat - entry1->start;
        entry1->end = entry2->start = splitat;

        if (entry1->aref.ar_amap) {
                amap_splitref(&entry1->aref, &entry2->aref, adj);
        }
        if (UVM_ET_ISSUBMAP(entry1)) {
                /* ... unlikely to happen, but play it safe */
                uvm_map_reference(entry1->object.sub_map);
        } else if (UVM_ET_ISOBJ(entry1)) {
                KASSERT(entry1->object.uvm_obj != NULL); /* suppress coverity */
                entry2->offset += adj;
                if (entry1->object.uvm_obj->pgops &&
                    entry1->object.uvm_obj->pgops->pgo_reference)
                        entry1->object.uvm_obj->pgops->pgo_reference(
                                  entry1->object.uvm_obj);
              }
      }
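
/*
 * Illustrative example (not part of the original source): splitting an
 * object-backed entry covering [0x1000, 0x5000) at 0x3000 leaves entry1
 * covering [0x1000, 0x3000) and entry2 covering [0x3000, 0x5000), with
 * entry2->offset advanced by adj == 0x2000 and an extra reference taken on
 * the backing object.
 */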
      
      /*
       * uvm_map_clip_start: ensure that the entry begins at or after
       *        the starting address, if it doesn't we split the entry.
       *
       * => caller should use UVM_MAP_CLIP_START macro rather than calling
       *    this directly
       * => map must be locked by caller
       */
      
      void
      uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry,
          vaddr_t start)
      {
              struct vm_map_entry *new_entry;
      
              /* uvm_map_simplify_entry(map, entry); */ /* XXX */
      
        uvm_map_check(map, "clip_start entry");
              uvm_mapent_check(entry);
      
              /*
               * Split off the front portion.  note that we must insert the new
               * entry BEFORE this one, so that this entry has the specified
               * starting address.
               */
        new_entry = uvm_mapent_alloc(map, 0);
        uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
              uvm_mapent_splitadj(new_entry, entry, start);
              uvm_map_entry_link(map, entry->prev, new_entry);
      
              uvm_map_check(map, "clip_start leave");
      }
      
      /*
       * uvm_map_clip_end: ensure that the entry ends at or before
 *        the ending address, if it doesn't we split the entry
       *
       * => caller should use UVM_MAP_CLIP_END macro rather than calling
       *    this directly
       * => map must be locked by caller
       */
      
      void
      uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t end)
      {
              struct vm_map_entry *new_entry;
      
        uvm_map_check(map, "clip_end entry");
              uvm_mapent_check(entry);
      
              /*
               *        Create a new entry and insert it
               *        AFTER the specified entry
               */
        new_entry = uvm_mapent_alloc(map, 0);
        uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
              uvm_mapent_splitadj(entry, new_entry, end);
              uvm_map_entry_link(map, entry, new_entry);
      
              uvm_map_check(map, "clip_end leave");
      }
      
      /*
       *   M A P   -   m a i n   e n t r y   p o i n t
       */
      /*
       * uvm_map: establish a valid mapping in a map
       *
       * => assume startp is page aligned.
       * => assume size is a multiple of PAGE_SIZE.
       * => assume sys_mmap provides enough of a "hint" to have us skip
       *        over text/data/bss area.
       * => map must be unlocked (we will lock it)
       * => <uobj,uoffset> value meanings (4 cases):
       *         [1] <NULL,uoffset>                == uoffset is a hint for PMAP_PREFER
       *         [2] <NULL,UVM_UNKNOWN_OFFSET>        == don't PMAP_PREFER
       *         [3] <uobj,uoffset>                == normal mapping
       *         [4] <uobj,UVM_UNKNOWN_OFFSET>        == uvm_map finds offset based on VA
       *
       *    case [4] is for kernel mappings where we don't know the offset until
       *    we've found a virtual address.   note that kernel object offsets are
       *    always relative to vm_map_min(kernel_map).
       *
       * => if `align' is non-zero, we align the virtual address to the specified
       *        alignment.
       *        this is provided as a mechanism for large pages.
       *
       * => XXXCDC: need way to map in external amap?
       */
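
/*
 * Illustrative example (not part of the original source, with `p' and `size'
 * standing in for a caller's proc pointer and page-aligned length): a caller
 * setting up an anonymous, copy-on-write mapping at a map-chosen address
 * might do something like
 *
 *	vaddr_t va = 0;
 *	error = uvm_map(&p->p_vmspace->vm_map, &va, size, NULL,
 *	    UVM_UNKNOWN_OFFSET, 0,
 *	    UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_ALL, UVM_INH_COPY,
 *	    UVM_ADV_NORMAL, UVM_FLAG_COPYONW));
 *
 * which corresponds to case [2] above: no uobj and no PMAP_PREFER hint.
 */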
      
      int
      uvm_map(struct vm_map *map, vaddr_t *startp /* IN/OUT */, vsize_t size,
          struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags)
      {
        struct uvm_map_args args;
        struct vm_map_entry *new_entry;
        int error;

        KASSERT((size & PAGE_MASK) == 0);
        KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);

        /*
         * for pager_map, allocate the new entry first to avoid sleeping
         * for memory while we have the map locked.
         */

        new_entry = NULL;
        if (map == pager_map) {
                new_entry = uvm_mapent_alloc(map, (flags & UVM_FLAG_NOWAIT));
                if (__predict_false(new_entry == NULL))
                        return ENOMEM;
        }
        if (map == pager_map)
                flags |= UVM_FLAG_NOMERGE;

        error = uvm_map_prepare(map, *startp, size, uobj, uoffset, align,
            flags, &args);
        if (!error) {
                error = uvm_map_enter(map, &args, new_entry);
                *startp = args.uma_start;
        } else if (new_entry) {
                uvm_mapent_free(new_entry);
        }

#if defined(DEBUG)
        if (!error && VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
                uvm_km_check_empty(map, *startp, *startp + size);
              }
      #endif /* defined(DEBUG) */
      
              return error;
      }
      
      /*
       * uvm_map_prepare:
       *
       * called with map unlocked.
       * on success, returns the map locked.
       */
      
      int
      uvm_map_prepare(struct vm_map *map, vaddr_t start, vsize_t size,
          struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags,
          struct uvm_map_args *args)
      {
        struct vm_map_entry *prev_entry;
              vm_prot_t prot = UVM_PROTECTION(flags);
              vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
      
              UVMHIST_FUNC("uvm_map_prepare");
              UVMHIST_CALLED(maphist);
      
              UVMHIST_LOG(maphist, "(map=%#jx, start=%#jx, size=%ju, flags=%#jx)",
                  (uintptr_t)map, start, size, flags);
              UVMHIST_LOG(maphist, "  uobj/offset %#jx/%jd", (uintptr_t)uobj,
                  uoffset,0,0);
      
              /*
               * detect a popular device driver bug.
               */
      
        KASSERT(doing_shutdown || curlwp != NULL);

        /*
         * zero-sized mapping doesn't make any sense.
         */
        KASSERT(size > 0);

        KASSERT((~flags & (UVM_FLAG_NOWAIT | UVM_FLAG_WAITVA)) != 0);

        uvm_map_check(map, "map entry");
      
              /*
               * check sanity of protection code
               */
      
              if ((prot & maxprot) != prot) {
                      UVMHIST_LOG(maphist, "<- prot. failure:  prot=%#jx, max=%#jx",
                      prot, maxprot,0,0);
                      return EACCES;
              }
      
              /*
               * figure out where to put new VM range
               */
retry:
        if (vm_map_lock_try(map) == false) {
                if ((flags & UVM_FLAG_TRYLOCK) != 0) {
                        return EAGAIN;
                }
                vm_map_lock(map); /* could sleep here */
        }
        if (flags & UVM_FLAG_UNMAP) {
                KASSERT(flags & UVM_FLAG_FIXED);
                KASSERT((flags & UVM_FLAG_NOWAIT) == 0);

                /*
                 * Set prev_entry to what it will need to be after any existing
                 * entries are removed later in uvm_map_enter().
                 */

                if (uvm_map_lookup_entry(map, start, &prev_entry)) {
                        if (start == prev_entry->start)
                                prev_entry = prev_entry->prev;
                        else
                                UVM_MAP_CLIP_END(map, prev_entry, start);
                        SAVE_HINT(map, map->hint, prev_entry);
                }
        } else {
                prev_entry = uvm_map_findspace(map, start, size, &start,
                    uobj, uoffset, align, flags);
        }
        if (prev_entry == NULL) {
                unsigned int timestamp;

                timestamp = map->timestamp;
                      UVMHIST_LOG(maphist,"waiting va timestamp=%#jx",
                                  timestamp,0,0,0);
                      map->flags |= VM_MAP_WANTVA;
                      vm_map_unlock(map);
      
                      /*
                       * try to reclaim kva and wait until someone does unmap.
                       * fragile locking here, so we awaken every second to
                       * recheck the condition.
                       */
      
                      mutex_enter(&map->misc_lock);
                      while ((map->flags & VM_MAP_WANTVA) != 0 &&
                    map->timestamp == timestamp) {
                        if ((flags & UVM_FLAG_WAITVA) == 0) {
                                mutex_exit(&map->misc_lock);
                                UVMHIST_LOG(maphist,
                                    "<- uvm_map_findspace failed!", 0,0,0,0);
                                return ENOMEM;
                              } else {
                                      cv_timedwait(&map->cv, &map->misc_lock, hz);
                              }
                      }
                      mutex_exit(&map->misc_lock);
                      goto retry;
              }
      
      #ifdef PMAP_GROWKERNEL
              /*
               * If the kernel pmap can't map the requested space,
               * then allocate more resources for it.
               */
        if (map == kernel_map && uvm_maxkaddr < (start + size))
                      uvm_maxkaddr = pmap_growkernel(start + size);
      #endif
      
              UVMMAP_EVCNT_INCR(map_call);
      
              /*
               * if uobj is null, then uoffset is either a VAC hint for PMAP_PREFER
               * [typically from uvm_map_reserve] or it is UVM_UNKNOWN_OFFSET.   in
               * either case we want to zero it  before storing it in the map entry
               * (because it looks strange and confusing when debugging...)
               *
               * if uobj is not null
               *   if uoffset is not UVM_UNKNOWN_OFFSET then we have a normal mapping
               *      and we do not need to change uoffset.
               *   if uoffset is UVM_UNKNOWN_OFFSET then we need to find the offset
               *      now (based on the starting address of the map).   this case is
               *      for kernel object mappings where we don't know the offset until
               *      the virtual address is found (with uvm_map_findspace).   the
               *      offset is the distance we are from the start of the map.
               */
      
        if (uobj == NULL) {
                uoffset = 0;
        } else {
                if (uoffset == UVM_UNKNOWN_OFFSET) {
                        KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
                        uoffset = start - vm_map_min(kernel_map);
                      }
              }
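
        /*
         * Illustrative example (not part of the original source): a kernel
         * object mapping placed at start == vm_map_min(kernel_map) + 0x4000
         * with uoffset == UVM_UNKNOWN_OFFSET ends up with uoffset == 0x4000,
         * i.e. its distance from the start of the kernel map.
         */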
      
        args->uma_flags = flags;
        args->uma_prev = prev_entry;
        args->uma_start = start;
        args->uma_size = size;
        args->uma_uobj = uobj;
        args->uma_uoffset = uoffset;

        UVMHIST_LOG(maphist, "<- done!", 0,0,0,0);
        return 0;
      }
      
      /*
       * uvm_map_enter:
       *
       * called with map locked.
       * unlock the map before returning.
       */
      
      int
      uvm_map_enter(struct vm_map *map, const struct uvm_map_args *args,
          struct vm_map_entry *new_entry)
      {
        struct vm_map_entry *prev_entry = args->uma_prev;
        struct vm_map_entry *dead = NULL, *dead_entries = NULL;

        const uvm_flag_t flags = args->uma_flags;
        const vm_prot_t prot = UVM_PROTECTION(flags);
        const vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
        const vm_inherit_t inherit = UVM_INHERIT(flags);
        const int amapwaitflag = (flags & UVM_FLAG_NOWAIT) ?
            AMAP_EXTEND_NOWAIT : 0;
              const int advice = UVM_ADVICE(flags);
      
              vaddr_t start = args->uma_start;
              vsize_t size = args->uma_size;
              struct uvm_object *uobj = args->uma_uobj;
              voff_t uoffset = args->uma_uoffset;
      
              const int kmap = (vm_map_pmap(map) == pmap_kernel());
              int merged = 0;
              int error;
              int newetype;
      
              UVMHIST_FUNC("uvm_map_enter");
              UVMHIST_CALLED(maphist);
      
              UVMHIST_LOG(maphist, "(map=%#jx, start=%#jx, size=%ju, flags=%#jx)",
                  (uintptr_t)map, start, size, flags);
              UVMHIST_LOG(maphist, "  uobj/offset %#jx/%jd", (uintptr_t)uobj,
                  uoffset,0,0);
      
              KASSERT(map->hint == prev_entry); /* bimerge case assumes this */
        KASSERT(vm_map_locked_p(map));
        KASSERT((flags & (UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP)) !=
                (UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP));

        if (uobj)
                newetype = UVM_ET_OBJ;
        else
                newetype = 0;

        if (flags & UVM_FLAG_COPYONW) {
                newetype |= UVM_ET_COPYONWRITE;
                if ((flags & UVM_FLAG_OVERLAY) == 0)
                        newetype |= UVM_ET_NEEDSCOPY;
              }
      
              /*
               * For mappings with unmap, remove any old entries now.  Adding the new
               * entry cannot fail because that can only happen if UVM_FLAG_NOWAIT
               * is set, and we do not support nowait and unmap together.
               */
      
        if (flags & UVM_FLAG_UNMAP) {
                KASSERT(flags & UVM_FLAG_FIXED);
                uvm_unmap_remove(map, start, start + size, &dead_entries, 0);
#ifdef DEBUG
                struct vm_map_entry *tmp_entry __diagused;
                bool rv __diagused;

                rv = uvm_map_lookup_entry(map, start, &tmp_entry);
                KASSERT(!rv);
                KASSERTMSG(prev_entry == tmp_entry,
                           "args %p prev_entry %p tmp_entry %p",
                           args, prev_entry, tmp_entry);
#endif
                SAVE_HINT(map, map->hint, prev_entry);
              }
      
              /*
               * try and insert in map by extending previous entry, if possible.
               * XXX: we don't try and pull back the next entry.   might be useful
               * for a stack, but we are currently allocating our stack in advance.
               */
      
        if (flags & UVM_FLAG_NOMERGE)
                goto nomerge;

        if (prev_entry->end == start &&
            prev_entry != &map->header &&
            UVM_ET_ISCOMPATIBLE(prev_entry, newetype, uobj, 0,
            prot, maxprot, inherit, advice, 0)) {

                if (uobj && prev_entry->offset +
                          (prev_entry->end - prev_entry->start) != uoffset)
                              goto forwardmerge;
      
                      /*
                       * can't extend a shared amap.  note: no need to lock amap to
                       * look at refs since we don't care about its exact value.
                 * if it is one (i.e. we have the only reference) it will stay there
                 */

                if (prev_entry->aref.ar_amap &&
                    amap_refs(prev_entry->aref.ar_amap) != 1) {
                              goto forwardmerge;
                      }
      
                      if (prev_entry->aref.ar_amap) {
                              error = amap_extend(prev_entry, size,
                                  amapwaitflag | AMAP_EXTEND_FORWARDS);
                              if (error)
                                      goto nomerge;
                      }
      
                      if (kmap) {
                              UVMMAP_EVCNT_INCR(kbackmerge);
                      } else {
                              UVMMAP_EVCNT_INCR(ubackmerge);
                      }
                      UVMHIST_LOG(maphist,"  starting back merge", 0, 0, 0, 0);
      
                      /*
                       * drop our reference to uobj since we are extending a reference
                       * that we already have (the ref count can not drop to zero).
                       */
      
                if (uobj && uobj->pgops->pgo_detach)
                        uobj->pgops->pgo_detach(uobj);
      
                      /*
                       * Now that we've merged the entries, note that we've grown
                       * and our gap has shrunk.  Then fix the tree.
                       */
                prev_entry->end += size;
                      prev_entry->gap -= size;
                      uvm_rb_fixup(map, prev_entry);
      
                      uvm_map_check(map, "map backmerged");
      
                      UVMHIST_LOG(maphist,"<- done (via backmerge)!", 0, 0, 0, 0);
                      merged++;
              }
      
forwardmerge:
        if (prev_entry->next->start == (start + size) &&
            prev_entry->next != &map->header &&
            UVM_ET_ISCOMPATIBLE(prev_entry->next, newetype, uobj, 0,
            prot, maxprot, inherit, advice, 0)) {

                if (uobj && prev_entry->next->offset != uoffset + size)
                              goto nomerge;
      
                      /*
                       * can't extend a shared amap.  note: no need to lock amap to
                       * look at refs since we don't care about its exact value.
                 * if it is one (i.e. we have the only reference) it will stay there.
                       *
                       * note that we also can't merge two amaps, so if we
                       * merged with the previous entry which has an amap,
                       * and the next entry also has an amap, we give up.
                       *
                       * Interesting cases:
                       * amap, new, amap -> give up second merge (single fwd extend)
                       * amap, new, none -> double forward extend (extend again here)
                       * none, new, amap -> double backward extend (done here)
                       * uobj, new, amap -> single backward extend (done here)
                       *
                       * XXX should we attempt to deal with someone refilling
                       * the deallocated region between two entries that are
                       * backed by the same amap (ie, arefs is 2, "prev" and
                       * "next" refer to it, and adding this allocation will
                       * close the hole, thus restoring arefs to 1 and
                       * deallocating the "next" vm_map_entry)?  -- @@@
                       */
      
                if (prev_entry->next->aref.ar_amap &&
                    (amap_refs(prev_entry->next->aref.ar_amap) != 1 ||
                     (merged && prev_entry->aref.ar_amap))) {
                        goto nomerge;
                }

                if (merged) {
                              /*
                               * Try to extend the amap of the previous entry to
                               * cover the next entry as well.  If it doesn't work
                               * just skip on, don't actually give up, since we've
                               * already completed the back merge.
                               */
                        if (prev_entry->aref.ar_amap) {
                                      if (amap_extend(prev_entry,
                                          prev_entry->next->end -
                                          prev_entry->next->start,
                                          amapwaitflag | AMAP_EXTEND_FORWARDS))
                                              goto nomerge;
                              }
      
                              /*
                               * Try to extend the amap of the *next* entry
                               * back to cover the new allocation *and* the
                               * previous entry as well (the previous merge
                               * didn't have an amap already otherwise we
                               * wouldn't be checking here for an amap).  If
                               * it doesn't work just skip on, again, don't
                               * actually give up, since we've already
                               * completed the back merge.
                               */
                              else if (prev_entry->next->aref.ar_amap) {
                                      if (amap_extend(prev_entry->next,
                                          prev_entry->end -
                                          prev_entry->start,
                                          amapwaitflag | AMAP_EXTEND_BACKWARDS))
                                              goto nomerge;
                              }
                      } else {
                              /*
                               * Pull the next entry's amap backwards to cover this
                               * new allocation.
                               */
                              if (prev_entry->next->aref.ar_amap) {
                                      error = amap_extend(prev_entry->next, size,
                                          amapwaitflag | AMAP_EXTEND_BACKWARDS);
                                      if (error)
                                              goto nomerge;
                              }
                      }
      
                      if (merged) {
                              if (kmap) {
                                      UVMMAP_EVCNT_DECR(kbackmerge);
                                      UVMMAP_EVCNT_INCR(kbimerge);
                              } else {
                                      UVMMAP_EVCNT_DECR(ubackmerge);
                                      UVMMAP_EVCNT_INCR(ubimerge);
                              }
                      } else {
                              if (kmap) {
                                      UVMMAP_EVCNT_INCR(kforwmerge);
                              } else {
                                      UVMMAP_EVCNT_INCR(uforwmerge);
                              }
                      }
                      UVMHIST_LOG(maphist,"  starting forward merge", 0, 0, 0, 0);
      
                      /*
                       * drop our reference to uobj since we are extending a reference
                        * that we already have (the ref count cannot drop to zero).
                       */
   12                 if (uobj && uobj->pgops->pgo_detach)
    7                         uobj->pgops->pgo_detach(uobj);
      
   10                 if (merged) {
                              dead = prev_entry->next;
                              prev_entry->end = dead->end;
    9                         uvm_map_entry_unlink(map, dead);
                              if (dead->aref.ar_amap != NULL) {
                                      prev_entry->aref = dead->aref;
                                      dead->aref.ar_amap = NULL;
                              }
                      } else {
    4                         prev_entry->next->start -= size;
                              if (prev_entry != &map->header) {
    4                                 prev_entry->gap -= size;
    4                                 KASSERT(prev_entry->gap == uvm_rb_gap(prev_entry));
    4                                 uvm_rb_fixup(map, prev_entry);
                              }
    4                         if (uobj)
    2                                 prev_entry->next->offset = uoffset;
                      }
      
   12                 uvm_map_check(map, "map forwardmerged");
      
                      UVMHIST_LOG(maphist,"<- done forwardmerge", 0, 0, 0, 0);
                      merged++;
              }
      
  146 nomerge:
              if (!merged) {
                      UVMHIST_LOG(maphist,"  allocating new map entry", 0, 0, 0, 0);
                      if (kmap) {
                              UVMMAP_EVCNT_INCR(knomerge);
                      } else {
                              UVMMAP_EVCNT_INCR(unomerge);
                      }
      
                      /*
                       * allocate new entry and link it in.
                       */
      
   75                 if (new_entry == NULL) {
   79                         new_entry = uvm_mapent_alloc(map,
                                      (flags & UVM_FLAG_NOWAIT));
                              if (__predict_false(new_entry == NULL)) {
                                      error = ENOMEM;
                                      goto done;
                              }
                      }
  117                 new_entry->start = start;
                      new_entry->end = new_entry->start + size;
                      new_entry->object.uvm_obj = uobj;
                      new_entry->offset = uoffset;
      
                      new_entry->etype = newetype;
      
                      if (flags & UVM_FLAG_NOMERGE) {
   50                         new_entry->flags |= UVM_MAP_NOMERGE;
                      }
      
  117                 new_entry->protection = prot;
                      new_entry->max_protection = maxprot;
                      new_entry->inheritance = inherit;
                      new_entry->wired_count = 0;
                      new_entry->advice = advice;
                      if (flags & UVM_FLAG_OVERLAY) {
      
                              /*
                               * to_add: for BSS we overallocate a little since we
                               * are likely to extend
                               */
      
    2                         vaddr_t to_add = (flags & UVM_FLAG_AMAPPAD) ?
                                      UVM_AMAP_CHUNK << PAGE_SHIFT : 0;
                              struct vm_amap *amap = amap_alloc(size, to_add,
                                  (flags & UVM_FLAG_NOWAIT));
                              if (__predict_false(amap == NULL)) {
                                      error = ENOMEM;
                                      goto done;
                              }
    2                         new_entry->aref.ar_pageoff = 0;
                              new_entry->aref.ar_amap = amap;
                      } else {
  115                         new_entry->aref.ar_pageoff = 0;
                              new_entry->aref.ar_amap = NULL;
                      }
  117                 uvm_map_entry_link(map, prev_entry, new_entry);
      
                      /*
                       * Update the free space hint
                       */
      
                      if ((map->first_free == prev_entry) &&
   43                     (prev_entry->end >= new_entry->start))
                              map->first_free = new_entry;
      
                      new_entry = NULL;
              }
      
  146         map->size += size;
      
              UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
      
   18         error = 0;
      
      done:
              vm_map_unlock(map);
      
              if (new_entry) {
                      uvm_mapent_free(new_entry);
              }
   50         if (dead) {
                      KDASSERT(merged);
    9                 uvm_mapent_free(dead);
              }
  146         if (dead_entries)
   44                 uvm_unmap_detach(dead_entries, 0);
      
  146         return error;
      }
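
       /*
        * The forward-merge test above boils down to the neighbour-amap
        * cases listed in the comment ("amap, new, amap" and friends).  A
        * minimal stand-alone sketch of that decision, with booleans
        * standing in for "this entry has an amap"; can_forward_merge()
        * and its parameters are hypothetical and not used by the code
        * above:
        */
       #if 0
       static bool
       can_forward_merge(bool prev_has_amap, bool next_has_amap,
           bool next_amap_shared, bool already_back_merged)
       {
               if (!next_has_amap)
                       return true;    /* nothing would need extending */
               if (next_amap_shared)
                       return false;   /* can't extend a shared amap */
               if (already_back_merged && prev_has_amap)
                       return false;   /* would have to merge two amaps */
               return true;
       }
       #endif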
      
      /*
        * uvm_map_lookup_entry_bytree: look up an entry in the tree
       */
      
      static inline bool
      uvm_map_lookup_entry_bytree(struct vm_map *map, vaddr_t address,
          struct vm_map_entry **entry        /* OUT */)
      {
              struct vm_map_entry *prev = &map->header;
              struct vm_map_entry *cur = ROOT_ENTRY(map);
      
  383         while (cur) {
                      UVMMAP_EVCNT_INCR(mlk_treeloop);
  383                 if (address >= cur->start) {
  339                         if (address < cur->end) {
                                      *entry = cur;
                                      return true;
                              }
                              prev = cur;
  339                         cur = RIGHT_ENTRY(cur);
                      } else
  376                         cur = LEFT_ENTRY(cur);
              }
  114         *entry = prev;
              return false;
      }
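
       /*
        * uvm_map_lookup_entry_bytree() returns either the entry that
        * contains "address" (true) or its predecessor (false).  A sketch
        * of the same contract over a plain sorted array of [start, end)
        * ranges; "struct range" and range_lookup() are hypothetical, and
        * the scan is linear rather than a tree descent:
        */
       #if 0
       struct range {
               vaddr_t start;
               vaddr_t end;
       };

       static bool
       range_lookup(const struct range *r, int n, vaddr_t address, int *idx)
       {
               int i, prev = -1;

               for (i = 0; i < n; i++) {
                       if (address < r[i].start)
                               break;          /* in the gap before r[i] */
                       if (address < r[i].end) {
                               *idx = i;       /* inside r[i] */
                               return true;
                       }
                       prev = i;               /* r[i] ends at or below address */
               }
               *idx = prev;                    /* -1: below every range */
               return false;
       }
       #endif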
      
      /*
       * uvm_map_lookup_entry: find map entry at or before an address
       *
       * => map must at least be read-locked by caller
       * => entry is returned in "entry"
       * => return value is true if address is in the returned entry
       */
      
      bool
      uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
          struct vm_map_entry **entry        /* OUT */)
      {
              struct vm_map_entry *cur;
              bool use_tree = false;
              UVMHIST_FUNC("uvm_map_lookup_entry");
              UVMHIST_CALLED(maphist);
      
              UVMHIST_LOG(maphist,"(map=%#jx,addr=%#jx,ent=%#jx)",
                  (uintptr_t)map, address, (uintptr_t)entry, 0);
      
              /*
               * start looking either from the head of the
               * list, or from the hint.
               */
      
  416         cur = map->hint;
      
              if (cur == &map->header)
   33                 cur = cur->next;
      
              UVMMAP_EVCNT_INCR(mlk_call);
  416         if (address >= cur->start) {
      
                      /*
                       * go from hint to end of list.
                       *
                       * but first, make a quick check to see if
                       * we are already looking at the entry we
                       * want (which is usually the case).
                       * note also that we don't need to save the hint
                       * here... it is the same hint (unless we are
                       * at the header, in which case the hint didn't
                       * buy us anything anyway).
                       */
      
  337                 if (cur != &map->header && cur->end > address) {
                              UVMMAP_EVCNT_INCR(mlk_hint);
  271                         *entry = cur;
                              UVMHIST_LOG(maphist,"<- got it via hint (%#jx)",
                                  (uintptr_t)cur, 0, 0, 0);
                              uvm_mapent_check(*entry);
                              return (true);
                      }
      
  216                 if (map->nentries > 15)
                              use_tree = true;
              } else {
      
                      /*
                       * invalid hint.  use tree.
                       */
                      use_tree = true;
              }
      
  383         uvm_map_check(map, __func__);
      
              if (use_tree) {
                      /*
                       * Simple lookup in the tree.  Happens when the hint is
                        * invalid, or nentries exceeds the threshold.
                       */
                      UVMMAP_EVCNT_INCR(mlk_tree);
  383                 if (uvm_map_lookup_entry_bytree(map, address, entry)) {
                              goto got;
                      } else {
                              goto failed;
                      }
              }
      
              /*
               * search linearly
               */
      
              UVMMAP_EVCNT_INCR(mlk_list);
              while (cur != &map->header) {
                      UVMMAP_EVCNT_INCR(mlk_listloop);
    3                 if (cur->end > address) {
    3                         if (address >= cur->start) {
                                      /*
                                       * save this lookup for future
                                       * hints, and return
                                       */
      
  334                                 *entry = cur;
      got:
  334                                 SAVE_HINT(map, map->hint, *entry);
                                      UVMHIST_LOG(maphist,"<- search got it (%#jx)",
                                              (uintptr_t)cur, 0, 0, 0);
                                      KDASSERT((*entry)->start <= address);
  334                                 KDASSERT(address < (*entry)->end);
  334                                 uvm_mapent_check(*entry);
                                      return (true);
                              }
                              break;
                      }
    3                 cur = cur->next;
              }
              *entry = cur->prev;
      failed:
  114         SAVE_HINT(map, map->hint, *entry);
              UVMHIST_LOG(maphist,"<- failed!",0,0,0,0);
   57         KDASSERT((*entry) == &map->header || (*entry)->end <= address);
  114         KDASSERT((*entry)->next == &map->header ||
                  address < (*entry)->next->start);
  416         return (false);
      }
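
       /*
        * Typical use of uvm_map_lookup_entry() (sketch only: "map" and
        * "va" are placeholders, and the read lock is taken with the usual
        * vm_map_lock_read()/vm_map_unlock_read() helpers as required by
        * the contract above):
        */
       #if 0
               struct vm_map_entry *entry;

               vm_map_lock_read(map);
               if (uvm_map_lookup_entry(map, va, &entry)) {
                       /* mapped: entry->start <= va < entry->end */
               } else {
                       /*
                        * not mapped: "entry" is the entry just before the
                        * gap, or &map->header if va precedes all entries.
                        */
               }
               vm_map_unlock_read(map);
       #endif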
      
      /*
        * See if the range between start and start + length fits in the gap
        * between entry->end and entry->next->start.  Returns 1 if it fits,
        * 0 if it doesn't fit, and -1 if the address wraps around.
       */
      static int
      uvm_map_space_avail(vaddr_t *start, vsize_t length, voff_t uoffset,
          vsize_t align, int flags, int topdown, struct vm_map_entry *entry)
      {
              vaddr_t end;
      
      #ifdef PMAP_PREFER
              /*
               * push start address forward as needed to avoid VAC alias problems.
               * we only do this if a valid offset is specified.
               */
      
              if (uoffset != UVM_UNKNOWN_OFFSET)
                      PMAP_PREFER(uoffset, start, length, topdown);
      #endif
  108         if ((flags & UVM_FLAG_COLORMATCH) != 0) {
   50                 KASSERT(align < uvmexp.ncolors);
   50                 if (uvmexp.ncolors > 1) {
   50                         const u_int colormask = uvmexp.colormask;
   50                         const u_int colorsize = colormask + 1;
                              vaddr_t hint = atop(*start);
                              const u_int color = hint & colormask;
                              if (color != align) {
   50                                 hint -= color;        /* adjust to color boundary */
                                      KASSERT((hint & colormask) == 0);
                                      if (topdown) {
                                              if (align > color)
                                                      hint -= colorsize;
                                      } else {
   50                                         if (align < color)
   49                                                 hint += colorsize;
                                      }
   50                                 *start = ptoa(hint + align); /* adjust to color */
                              }
                      }
              } else {
   59                 KASSERT(powerof2(align));
   59                 uvm_map_align_va(start, align, topdown);
                      /*
                       * XXX Should we PMAP_PREFER() here again?
                       * eh...i think we're okay
                       */
              }
      
              /*
               * Find the end of the proposed new region.  Be sure we didn't
               * wrap around the address; if so, we lose.  Otherwise, if the
               * proposed new region fits before the next entry, we win.
               */
      
  108         end = *start + length;
              if (end < *start)
                      return (-1);
      
  108         if (entry->next->start >= end && *start >= entry->end)
                      return (1);
      
              return (0);
      }
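
       /*
        * The UVM_FLAG_COLORMATCH adjustment above, restated as a small
        * stand-alone helper for the bottom-up case: advance a hint to the
        * next page whose color is "want".  color_align_up() is a
        * hypothetical name; colormask is the number of colors minus one:
        */
       #if 0
       static vaddr_t
       color_align_up(vaddr_t va, u_int want, u_int colormask)
       {
               vaddr_t pg = atop(va);
               const u_int color = pg & colormask;

               if (color != want) {
                       pg -= color;                    /* back to color 0 */
                       if (want < color)
                               pg += colormask + 1;    /* skip to next color run */
                       pg += want;
               }
               return ptoa(pg);
       }
       #endif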
      
      /*
       * uvm_map_findspace: find "length" sized space in "map".
       *
       * => "hint" is a hint about where we want it, unless UVM_FLAG_FIXED is
       *        set in "flags" (in which case we insist on using "hint").
       * => "result" is VA returned
       * => uobj/uoffset are to be used to handle VAC alignment, if required
       * => if "align" is non-zero, we attempt to align to that value.
       * => caller must at least have read-locked map
        * => returns NULL on failure, or a pointer to the previous map entry on success
       * => note this is a cross between the old vm_map_findspace and vm_map_find
       */
      
      struct vm_map_entry *
      uvm_map_findspace(struct vm_map *map, vaddr_t hint, vsize_t length,
          vaddr_t *result /* OUT */, struct uvm_object *uobj, voff_t uoffset,
          vsize_t align, int flags)
      {
  114         struct vm_map_entry *entry;
              struct vm_map_entry *child, *prev, *tmp;
              vaddr_t orig_hint __diagused;
              const int topdown = map->flags & VM_MAP_TOPDOWN;
              UVMHIST_FUNC("uvm_map_findspace");
              UVMHIST_CALLED(maphist);
      
              UVMHIST_LOG(maphist, "(map=%#jx, hint=%#jx, len=%ju, flags=%#jx)",
                  (uintptr_t)map, hint, length, flags);
   65         KASSERT((flags & UVM_FLAG_COLORMATCH) != 0 || powerof2(align));
   50         KASSERT((flags & UVM_FLAG_COLORMATCH) == 0 || align < uvmexp.ncolors);
  114         KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
      
  114         uvm_map_check(map, "map_findspace entry");
      
              /*
               * remember the original hint.  if we are aligning, then we
               * may have to try again with no alignment constraint if
               * we fail the first time.
               */
      
              orig_hint = hint;
              if (hint < vm_map_min(map)) {        /* check ranges ... */
                      if (flags & UVM_FLAG_FIXED) {
                              UVMHIST_LOG(maphist,"<- VA below map range",0,0,0,0);
                              return (NULL);
                      }
   45                 hint = vm_map_min(map);
              }
  114         if (hint > vm_map_max(map)) {
                      UVMHIST_LOG(maphist,"<- VA %#jx > range [%#jx->%#jx]",
                          hint, vm_map_min(map), vm_map_max(map), 0);
                      return (NULL);
              }
      
              /*
                * hint may not be aligned properly; we may need to round it
                * up or down before proceeding further.
               */
  114         if ((flags & UVM_FLAG_COLORMATCH) == 0)
   65                 uvm_map_align_va(&hint, align, topdown);
      
              /*
               * Look for the first possible address; if there's already
               * something at this address, we have to start after it.
               */
      
              /*
               * @@@: there are four, no, eight cases to consider.
               *
               * 0: found,     fixed,     bottom up -> fail
               * 1: found,     fixed,     top down  -> fail
               * 2: found,     not fixed, bottom up -> start after entry->end,
               *                                       loop up
               * 3: found,     not fixed, top down  -> start before entry->start,
               *                                       loop down
               * 4: not found, fixed,     bottom up -> check entry->next->start, fail
               * 5: not found, fixed,     top down  -> check entry->next->start, fail
               * 6: not found, not fixed, bottom up -> check entry->next->start,
               *                                       loop up
               * 7: not found, not fixed, top down  -> check entry->next->start,
               *                                       loop down
               *
                * as you can see, it reduces to roughly five cases, and
               * adding top down mapping only adds one unique case (without
               * it, there would be four cases).
               */
      
  114         if ((flags & UVM_FLAG_FIXED) == 0 && hint == vm_map_min(map)) {
  108                 entry = map->first_free;
              } else {
    6                 if (uvm_map_lookup_entry(map, hint, &entry)) {
                              /* "hint" address already in use ... */
                              if (flags & UVM_FLAG_FIXED) {
                                      UVMHIST_LOG(maphist, "<- fixed & VA in use",
                                          0, 0, 0, 0);
                                      return (NULL);
                              }
                              if (topdown)
                                      /* Start from lower gap. */
                                      entry = entry->prev;
                      } else if (flags & UVM_FLAG_FIXED) {
    4                         if (entry->next->start >= hint + length &&
                                  hint + length > hint)
                                      goto found;
      
                              /* "hint" address is gap but too small */
                              UVMHIST_LOG(maphist, "<- fixed mapping failed",
                                  0, 0, 0, 0);
                              return (NULL); /* only one shot at it ... */
                      } else {
                              /*
                               * See if given hint fits in this gap.
                               */
                              switch (uvm_map_space_avail(&hint, length,
                                  uoffset, align, flags, topdown, entry)) {
                              case 1:
                                      goto found;
                              case -1:
                                      goto wraparound;
                              }
      
                              if (topdown) {
                                      /*
                                        * There is still a chance to fit
                                       * if hint > entry->end.
                                       */
                              } else {
                                      /* Start from higher gap. */
                                      entry = entry->next;
                                      if (entry == &map->header)
                                              goto notfound;
                                      goto nextgap;
                              }
                      }
              }
      
              /*
                * Note that the UVM_FLAG_FIXED case has already been handled.
               */
              KDASSERT((flags & UVM_FLAG_FIXED) == 0);
      
              /* Try to find the space in the red-black tree */
      
              /* Check slot before any entry */
  108         hint = topdown ? entry->next->start - length : entry->end;
              switch (uvm_map_space_avail(&hint, length, uoffset, align, flags,
                  topdown, entry)) {
              case 1:
                      goto found;
              case -1:
                      goto wraparound;
              }
      
   72 nextgap:
              KDASSERT((flags & UVM_FLAG_FIXED) == 0);
              /* If there is not enough space in the whole tree, we fail */
   72         tmp = ROOT_ENTRY(map);
   72         if (tmp == NULL || tmp->maxgap < length)
                      goto notfound;
      
              prev = NULL; /* previous candidate */
      
              /* Find an entry close to hint that has enough space */
               while (tmp != NULL) {
   72                 KASSERT(tmp->next->start == tmp->end + tmp->gap);
   72                 if (topdown) {
                              if (tmp->next->start < hint + length &&
                                  (prev == NULL || tmp->end > prev->end)) {
                                      if (tmp->gap >= length)
                                              prev = tmp;
                                      else if ((child = LEFT_ENTRY(tmp)) != NULL
                                          && child->maxgap >= length)
                                              prev = tmp;
                              }
                      } else {
   72                         if (tmp->end >= hint &&
    5                             (prev == NULL || tmp->end < prev->end)) {
   72                                 if (tmp->gap >= length)
                                              prev = tmp;
   67                                 else if ((child = RIGHT_ENTRY(tmp)) != NULL
   67                                     && child->maxgap >= length)
                                              prev = tmp;
                              }
                      }
   72                 if (tmp->next->start < hint + length)
                              child = RIGHT_ENTRY(tmp);
   72                 else if (tmp->end > hint)
                              child = LEFT_ENTRY(tmp);
                      else {
    2                         if (tmp->gap >= length)
                                      break;
                              if (topdown)
   70                                 child = LEFT_ENTRY(tmp);
                              else
                                      child = RIGHT_ENTRY(tmp);
                      }
   70                 if (child == NULL || child->maxgap < length)
                              break;
                      tmp = child;
              }
      
   72         if (tmp != NULL && tmp->start < hint && hint < tmp->next->start) {
                      /*
                        * Check if the entry that we found satisfies the
                       * space requirement
                       */
    3                 if (topdown) {
                              if (hint > tmp->next->start - length)
                                      hint = tmp->next->start - length;
                      } else {
    3                         if (hint < tmp->end)
    1                                 hint = tmp->end;
                      }
    3                 switch (uvm_map_space_avail(&hint, length, uoffset, align,
                          flags, topdown, tmp)) {
                      case 1:
                              entry = tmp;
                              goto found;
                      case -1:
                              goto wraparound;
                      }
                      if (tmp->gap >= length)
                              goto listsearch;
              }
   70         if (prev == NULL)
                      goto notfound;
      
   70         if (topdown) {
                      KASSERT(orig_hint >= prev->next->start - length ||
                          prev->next->start - length > prev->next->start);
                      hint = prev->next->start - length;
              } else {
   70                 KASSERT(orig_hint <= prev->end);
   70                 hint = prev->end;
              }
   70         switch (uvm_map_space_avail(&hint, length, uoffset, align,
                  flags, topdown, prev)) {
              case 1:
    3                 entry = prev;
                      goto found;
              case -1:
                      goto wraparound;
              }
   67         if (prev->gap >= length)
                      goto listsearch;
      
   67         if (topdown)
                      tmp = LEFT_ENTRY(prev);
              else
   67                 tmp = RIGHT_ENTRY(prev);
              for (;;) {
   67                 KASSERT(tmp && tmp->maxgap >= length);
   67                 if (topdown)
                              child = RIGHT_ENTRY(tmp);
                      else
   67                         child = LEFT_ENTRY(tmp);
   67                 if (child && child->maxgap >= length) {
                              tmp = child;
                              continue;
                      }
   67                 if (tmp->gap >= length)
                              break;
   65                 if (topdown)
                              tmp = LEFT_ENTRY(tmp);
                      else
   65                         tmp = RIGHT_ENTRY(tmp);
              }
      
   67         if (topdown) {
                      KASSERT(orig_hint >= tmp->next->start - length ||
                          tmp->next->start - length > tmp->next->start);
                      hint = tmp->next->start - length;
              } else {
   67                 KASSERT(orig_hint <= tmp->end);
   67                 hint = tmp->end;
              }
   67         switch (uvm_map_space_avail(&hint, length, uoffset, align,
                  flags, topdown, tmp)) {
              case 1:
   65                 entry = tmp;
                      goto found;
              case -1:
                      goto wraparound;
              }
      
              /*
               * The tree fails to find an entry because of offset or alignment
               * restrictions.  Search the list instead.
               */
    5  listsearch:
              /*
               * Look through the rest of the map, trying to fit a new region in
               * the gap between existing regions, or after the very last region.
               * note: entry->end = base VA of current gap,
               *         entry->next->start = VA of end of current gap
               */
      
              for (;;) {
                      /* Update hint for current gap. */
    5                 hint = topdown ? entry->next->start - length : entry->end;
      
                      /* See if it fits. */
                      switch (uvm_map_space_avail(&hint, length, uoffset, align,
                          flags, topdown, entry)) {
                      case 1:
                              goto found;
                      case -1:
                              goto wraparound;
                      }
      
                      /* Advance to next/previous gap */
                      if (topdown) {
                              if (entry == &map->header) {
                                      UVMHIST_LOG(maphist, "<- failed (off start)",
                                          0,0,0,0);
                                      goto notfound;
                              }
                              entry = entry->prev;
                      } else {
    5                         entry = entry->next;
                              if (entry == &map->header) {
                                      UVMHIST_LOG(maphist, "<- failed (off end)",
                                          0,0,0,0);
                                      goto notfound;
                              }
                      }
              }
      
  112  found:
              SAVE_HINT(map, map->hint, entry);
              *result = hint;
              UVMHIST_LOG(maphist,"<- got it!  (result=%#jx)", hint, 0,0,0);
  108         KASSERTMSG( topdown || hint >= orig_hint, "hint: %jx, orig_hint: %jx",
                  (uintmax_t)hint, (uintmax_t)orig_hint);
    4         KASSERTMSG(!topdown || hint <= orig_hint, "hint: %jx, orig_hint: %jx",
                  (uintmax_t)hint, (uintmax_t)orig_hint);
  112         KASSERT(entry->end <= hint);
  112         KASSERT(hint + length <= entry->next->start);
              return (entry);
      
       wraparound:
              UVMHIST_LOG(maphist, "<- failed (wrap around)", 0,0,0,0);
      
  114         return (NULL);
      
       notfound:
              UVMHIST_LOG(maphist, "<- failed (notfound)", 0,0,0,0);
      
              return (NULL);
      }
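
       /*
        * Sketch of a typical uvm_map_findspace() call ("map" and "length"
        * are placeholders; the map is assumed to be at least read-locked,
        * as the contract above requires):
        */
       #if 0
               struct vm_map_entry *prev;
               vaddr_t va;

               prev = uvm_map_findspace(map, vm_map_min(map), length, &va,
                   NULL, UVM_UNKNOWN_OFFSET, 0, 0);
               if (prev == NULL) {
                       /* no gap of "length" bytes anywhere in the map */
               } else {
                       /* [va, va + length) fits between prev and prev->next */
               }
       #endif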
      
      /*
       *   U N M A P   -   m a i n   h e l p e r   f u n c t i o n s
       */
      
      /*
        * uvm_unmap_remove: remove mappings from a vm_map (from "start" up to "end")
       *
       * => caller must check alignment and size
       * => map must be locked by caller
        * => we return a list of map entries that we've removed from the map
       *    in "entry_list"
       */
      
      void
      uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
          struct vm_map_entry **entry_list /* OUT */, int flags)
      {
  111         struct vm_map_entry *entry, *first_entry, *next;
              vaddr_t len;
              UVMHIST_FUNC("uvm_unmap_remove"); UVMHIST_CALLED(maphist);
      
              UVMHIST_LOG(maphist,"(map=%#jx, start=%#jx, end=%#jx)",
                  (uintptr_t)map, start, end, 0);
              VM_MAP_RANGE_CHECK(map, start, end);
      
              uvm_map_check(map, "unmap_remove entry");
      
              /*
               * find first entry
               */
      
              if (uvm_map_lookup_entry(map, start, &first_entry) == true) {
                      /* clip and go... */
  109                 entry = first_entry;
   20                 UVM_MAP_CLIP_START(map, entry, start);
                      /* critical!  prevents stale hint */
  109                 SAVE_HINT(map, entry, entry->prev);
              } else {
    3                 entry = first_entry->next;
              }
      
              /*
               * Save the free space hint
               */
      
  111         if (map->first_free != &map->header && map->first_free->start >= start)
                      map->first_free = entry->prev;
      
              /*
               * note: we now re-use first_entry for a different task.  we remove
               * a number of map entries from the map and save them in a linked
               * list headed by "first_entry".  once we remove them from the map
               * the caller should unlock the map and drop the references to the
                * backing objects [cf. uvm_unmap_detach].  the goal is to
               * separate unmapping from reference dropping.  why?
               *   [1] the map has to be locked for unmapping
               *   [2] the map need not be locked for reference dropping
               *   [3] dropping references may trigger pager I/O, and if we hit
               *       a pager that does synchronous I/O we may have to wait for it.
               *   [4] we would like all waiting for I/O to occur with maps unlocked
               *       so that we don't block other threads.
               */
      
  111         first_entry = NULL;
              *entry_list = NULL;
      
              /*
               * break up the area into map entry sized regions and unmap.  note
               * that all mappings have to be removed before we can even consider
               * dropping references to amaps or VM objects (otherwise we could end
               * up with a mapping to a page on the free list which would be very bad)
               */
      
  111         while ((entry != &map->header) && (entry->start < end)) {
  109                 KASSERT((entry->flags & UVM_MAP_STATIC) == 0);
      
  109                 UVM_MAP_CLIP_END(map, entry, end);
  109                 next = entry->next;
                      len = entry->end - entry->start;
      
                      /*
                       * unwire before removing addresses from the pmap; otherwise
                       * unwiring will put the entries back into the pmap (XXX).
                       */
      
                      if (VM_MAPENT_ISWIRED(entry)) {
   14                         uvm_map_entry_unwire(map, entry);
                      }
  109                 if (flags & UVM_FLAG_VAONLY) {
      
                              /* nothing */
      
   80                 } else if ((map->flags & VM_MAP_PAGEABLE) == 0) {
      
                              /*
                               * if the map is non-pageable, any pages mapped there
                               * must be wired and entered with pmap_kenter_pa(),
                               * and we should free any such pages immediately.
                               * this is mostly used for kmem_map.
                               */
   31                         KASSERT(vm_map_pmap(map) == pmap_kernel());
      
   31                         uvm_km_pgremove_intrsafe(map, entry->start, entry->end);
   55                 } else if (UVM_ET_ISOBJ(entry) &&
    7                            UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
                              panic("%s: kernel object %p %p\n",
                                  __func__, map, entry);
   51                 } else if (UVM_ET_ISOBJ(entry) || entry->aref.ar_amap) {
                              /*
                               * remove mappings the standard way.  lock object
                               * and/or amap to ensure vm_page state does not
                               * change while in pmap_remove().
                               */
      
   41                         uvm_map_lock_entry(entry);
                              pmap_remove(map->pmap, entry->start, entry->end);
                              uvm_map_unlock_entry(entry);
                      }
      
      #if defined(UVMDEBUG)
                      /*
                        * check if there are any remaining mappings,
                        * which would indicate a bug in the caller.
                       */
      
                      vaddr_t va;
                      for (va = entry->start; va < entry->end;
                          va += PAGE_SIZE) {
                              if (pmap_extract(vm_map_pmap(map), va, NULL)) {
                                      panic("%s: %#"PRIxVADDR" has mapping",
                                          __func__, va);
                              }
                      }
      
                      if (VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
                              uvm_km_check_empty(map, entry->start,
                                  entry->end);
                      }
      #endif /* defined(UVMDEBUG) */
      
                      /*
                       * remove entry from map and put it on our list of entries
                       * that we've nuked.  then go to next entry.
                       */
      
                      UVMHIST_LOG(maphist, "  removed map entry %#jx",
                          (uintptr_t)entry, 0, 0, 0);
      
                      /* critical!  prevents stale hint */
  109                 SAVE_HINT(map, entry, entry->prev);
      
  109                 uvm_map_entry_unlink(map, entry);
                      KASSERT(map->size >= len);
  109                 map->size -= len;
                      entry->prev = NULL;
                      entry->next = first_entry;
                      first_entry = entry;
                      entry = next;
              }
      
              /*
               * Note: if map is dying, leave pmap_update() for pmap_destroy(),
               * which will be called later.
               */
  111         if ((map->flags & VM_MAP_DYING) == 0) {
  111                 pmap_update(vm_map_pmap(map));
              } else {
                      KASSERT(vm_map_pmap(map) != pmap_kernel());
              }
      
  111         uvm_map_check(map, "unmap_remove leave");
      
              /*
               * now we've cleaned up the map and are ready for the caller to drop
               * references to the mapped objects.
               */
      
              *entry_list = first_entry;
              UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
      
  111         if (map->flags & VM_MAP_WANTVA) {
    1                 mutex_enter(&map->misc_lock);
                      map->flags &= ~VM_MAP_WANTVA;
                      cv_broadcast(&map->cv);
                      mutex_exit(&map->misc_lock);
              }
      }
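
       /*
        * The split between unmapping and reference dropping described
        * above gives the usual unmap sequence (sketch; "map", "start" and
        * "end" are placeholders):
        */
       #if 0
               struct vm_map_entry *dead_entries;

               vm_map_lock(map);
               uvm_unmap_remove(map, start, end, &dead_entries, 0);
               vm_map_unlock(map);     /* drop the map lock first ... */
               if (dead_entries != NULL)
                       uvm_unmap_detach(dead_entries, 0);  /* ... then drop refs */
       #endif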
      
      /*
       * uvm_unmap_detach: drop references in a chain of map entries
       *
       * => we will free the map entries as we traverse the list.
       */
      
      void
      uvm_unmap_detach(struct vm_map_entry *first_entry, int flags)
      {
              struct vm_map_entry *next_entry;
              UVMHIST_FUNC("uvm_unmap_detach"); UVMHIST_CALLED(maphist);
      
  109         while (first_entry) {
  109                 KASSERT(!VM_MAPENT_ISWIRED(first_entry));
                      UVMHIST_LOG(maphist,
                          "  detach %#jx: amap=%#jx, obj=%#jx, submap?=%jd",
                          (uintptr_t)first_entry,
                          (uintptr_t)first_entry->aref.ar_amap,
                          (uintptr_t)first_entry->object.uvm_obj,
                          UVM_ET_ISSUBMAP(first_entry));
      
                      /*
                       * drop reference to amap, if we've got one
                       */
      
  109                 if (first_entry->aref.ar_amap)
   37                         uvm_map_unreference_amap(first_entry, flags);
      
                      /*
                       * drop reference to our backing object, if we've got one
                       */
      
  109                 KASSERT(!UVM_ET_ISSUBMAP(first_entry));
  109                 if (UVM_ET_ISOBJ(first_entry) &&
   37                     first_entry->object.uvm_obj->pgops->pgo_detach) {
   37                         (*first_entry->object.uvm_obj->pgops->pgo_detach)
                                      (first_entry->object.uvm_obj);
                      }
  109                 next_entry = first_entry->next;
                      uvm_mapent_free(first_entry);
                      first_entry = next_entry;
              }
              UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
      }
      
      /*
       *   E X T R A C T I O N   F U N C T I O N S
       */
      
      /*
       * uvm_map_reserve: reserve space in a vm_map for future use.
       *
       * => we reserve space in a map by putting a dummy map entry in the
       *    map (dummy means obj=NULL, amap=NULL, prot=VM_PROT_NONE)
       * => map should be unlocked (we will write lock it)
       * => we return true if we were able to reserve space
       * => XXXCDC: should be inline?
       */
      
      int
      uvm_map_reserve(struct vm_map *map, vsize_t size,
          vaddr_t offset        /* hint for pmap_prefer */,
          vsize_t align        /* alignment */,
          vaddr_t *raddr        /* IN:hint, OUT: reserved VA */,
          uvm_flag_t flags        /* UVM_FLAG_FIXED or UVM_FLAG_COLORMATCH or 0 */)
      {
              UVMHIST_FUNC("uvm_map_reserve"); UVMHIST_CALLED(maphist);
      
              UVMHIST_LOG(maphist, "(map=%#jx, size=%#jx, offset=%#jx, addr=%#jx)",
                  (uintptr_t)map, size, offset, (uintptr_t)raddr);
      
              size = round_page(size);
      
              /*
               * reserve some virtual space.
               */
      
              if (uvm_map(map, raddr, size, NULL, offset, align,
                  UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE,
                  UVM_ADV_RANDOM, UVM_FLAG_NOMERGE|flags)) != 0) {
                  UVMHIST_LOG(maphist, "<- done (no VM)", 0,0,0,0);
                      return (false);
              }
      
              UVMHIST_LOG(maphist, "<- done (*raddr=%#jx)", *raddr,0,0,0);
              return (true);
      }
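
       /*
        * Sketch of reserving virtual space for later use ("map" and
        * "size" are placeholders).  The reservation is a blank entry that
        * can later be filled in, e.g. by uvm_map_extract()/uvm_map_replace():
        */
       #if 0
               vaddr_t va = 0;         /* hint; clamped to vm_map_min(map) */

               if (!uvm_map_reserve(map, size, 0, 0, &va, 0)) {
                       /* no space */
               } else {
                       /* [va, va + round_page(size)) is now reserved */
               }
       #endif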
      
      /*
       * uvm_map_replace: replace a reserved (blank) area of memory with
       * real mappings.
       *
       * => caller must WRITE-LOCK the map
       * => we return true if replacement was a success
        * => we expect the newents chain to have nnewents entries on it and
       *    we expect newents->prev to point to the last entry on the list
       * => note newents is allowed to be NULL
       */
      
      static int
      uvm_map_replace(struct vm_map *map, vaddr_t start, vaddr_t end,
          struct vm_map_entry *newents, int nnewents, vsize_t nsize,
          struct vm_map_entry **oldentryp)
      {
    5         struct vm_map_entry *oldent, *last;
      
              uvm_map_check(map, "map_replace entry");
      
              /*
               * first find the blank map entry at the specified address
               */
      
              if (!uvm_map_lookup_entry(map, start, &oldent)) {
    5                 return (false);
              }
      
              /*
               * check to make sure we have a proper blank entry
               */
      
    5         if (end < oldent->end) {
                      UVM_MAP_CLIP_END(map, oldent, end);
              }
    5         if (oldent->start != start || oldent->end != end ||
    5             oldent->object.uvm_obj != NULL || oldent->aref.ar_amap != NULL) {
                      return (false);
              }
      
      #ifdef DIAGNOSTIC
      
              /*
               * sanity check the newents chain
               */
      
              {
                      struct vm_map_entry *tmpent = newents;
                      int nent = 0;
                      vsize_t sz = 0;
                      vaddr_t cur = start;
      
    5                 while (tmpent) {
                              nent++;
    5                         sz += tmpent->end - tmpent->start;
                              if (tmpent->start < cur)
                                      panic("uvm_map_replace1");
    5                         if (tmpent->start >= tmpent->end || tmpent->end > end) {
                                      panic("uvm_map_replace2: "
                                          "tmpent->start=%#"PRIxVADDR
                                          ", tmpent->end=%#"PRIxVADDR
                                          ", end=%#"PRIxVADDR,
                                          tmpent->start, tmpent->end, end);
                              }
                              cur = tmpent->end;
    5                         if (tmpent->next) {
                                      if (tmpent->next->prev != tmpent)
                                              panic("uvm_map_replace3");
                              } else {
    5                                 if (newents->prev != tmpent)
                                              panic("uvm_map_replace4");
                              }
                              tmpent = tmpent->next;
                      }
    5                 if (nent != nnewents)
                              panic("uvm_map_replace5");
    5                 if (sz != nsize)
                              panic("uvm_map_replace6");
              }
      #endif
      
              /*
               * map entry is a valid blank!   replace it.   (this does all the
               * work of map entry link/unlink...).
               */
      
    5         if (newents) {
    5                 last = newents->prev;
      
                      /* critical: flush stale hints out of map */
                      SAVE_HINT(map, map->hint, newents);
                      if (map->first_free == oldent)
                              map->first_free = last;
      
    5                 last->next = oldent->next;
                      last->next->prev = last;
      
                      /* Fix RB tree */
                      uvm_rb_remove(map, oldent);
      
                      newents->prev = oldent->prev;
                      newents->prev->next = newents;
                      map->nentries = map->nentries + (nnewents - 1);
      
                      /* Fixup the RB tree */
                      {
                              int i;
                              struct vm_map_entry *tmp;
      
                              tmp = newents;
                              for (i = 0; i < nnewents && tmp; i++) {
    5                                 uvm_rb_insert(map, tmp);
                                      tmp = tmp->next;
                              }
                      }
              } else {
                      /* NULL list of new entries: just remove the old one */
                      clear_hints(map, oldent);
                      uvm_map_entry_unlink(map, oldent);
              }
    5         map->size -= end - start - nsize;
      
              uvm_map_check(map, "map_replace leave");
      
              /*
               * now we can free the old blank entry and return.
               */
      
              *oldentryp = oldent;
              return (true);
      }
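
       /*
        * uvm_map_replace() expects the newents chain in the form that
        * uvm_map_extract() below builds it: singly linked through ->next,
        * NULL terminated, with newents->prev pointing at the last entry.
        * A fragment showing how such a chain is grown and closed off
        * ("chain", "endchain" and "newentry" as in uvm_map_extract()):
        */
       #if 0
               newentry->next = NULL;
               newentry->prev = endchain;
               if (endchain == NULL) {
                       chain = endchain = newentry;    /* first entry */
               } else {
                       endchain->next = newentry;      /* append */
                       endchain = newentry;
               }
               /* ... after the last entry has been appended: */
               if (chain)
                       chain->prev = endchain;         /* head's ->prev = tail */
       #endif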
      
      /*
       * uvm_map_extract: extract a mapping from a map and put it somewhere
       *        (maybe removing the old mapping)
       *
       * => maps should be unlocked (we will write lock them)
       * => returns 0 on success, error code otherwise
       * => start must be page aligned
       * => len must be page sized
       * => flags:
       *      UVM_EXTRACT_REMOVE: remove mappings from srcmap
       *      UVM_EXTRACT_CONTIG: abort if unmapped area (advisory only)
       *      UVM_EXTRACT_QREF: for a temporary extraction do quick obj refs
       *      UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
       *      UVM_EXTRACT_PROT_ALL: set prot to UVM_PROT_ALL as we go
       *    >>>NOTE: if you set REMOVE, you are not allowed to use CONTIG or QREF!<<<
       *    >>>NOTE: QREF's must be unmapped via the QREF path, thus should only
       *             be used from within the kernel in a kernel level map <<<
       */
      
      int
      uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
          struct vm_map *dstmap, vaddr_t *dstaddrp, int flags)
      {
    5         vaddr_t dstaddr, end, newend, oldoffset, fudge, orig_fudge;
              struct vm_map_entry *chain, *endchain, *entry, *orig_entry, *newentry,
                  *deadentry, *oldentry;
              struct vm_map_entry *resentry = NULL; /* a dummy reservation entry */
              vsize_t elen __unused;
              int nchain, error, copy_ok;
              vsize_t nsize;
              UVMHIST_FUNC("uvm_map_extract"); UVMHIST_CALLED(maphist);
      
              UVMHIST_LOG(maphist,"(srcmap=%#jx,start=%#jx, len=%#jx",
                  (uintptr_t)srcmap, start, len, 0);
              UVMHIST_LOG(maphist," ...,dstmap=%#jx, flags=%#jx)",
                  (uintptr_t)dstmap, flags, 0, 0);
      
              /*
               * step 0: sanity check: start must be on a page boundary, length
               * must be page sized.  can't ask for CONTIG/QREF if you asked for
               * REMOVE.
               */
      
              KASSERT((start & PAGE_MASK) == 0 && (len & PAGE_MASK) == 0);
    5         KASSERT((flags & UVM_EXTRACT_REMOVE) == 0 ||
                      (flags & (UVM_EXTRACT_CONTIG|UVM_EXTRACT_QREF)) == 0);
      
              /*
               * step 1: reserve space in the target map for the extracted area
               */
      
    5         if ((flags & UVM_EXTRACT_RESERVED) == 0) {
    5                 dstaddr = vm_map_min(dstmap);
                      if (!uvm_map_reserve(dstmap, len, start, 
                          atop(start) & uvmexp.colormask, &dstaddr,
                          UVM_FLAG_COLORMATCH))
                              return (ENOMEM);
    5                 KASSERT((atop(start ^ dstaddr) & uvmexp.colormask) == 0);
    5                 *dstaddrp = dstaddr;        /* pass address back to caller */
                      UVMHIST_LOG(maphist, "  dstaddr=%#jx", dstaddr,0,0,0);
              } else {
                      dstaddr = *dstaddrp;
              }
      
              /*
                * step 2: set up for the extraction process loop by init'ing the
               * map entry chain, locking src map, and looking up the first useful
               * entry in the map.
               */
      
    5         end = start + len;
              newend = dstaddr + len;
              chain = endchain = NULL;
              nchain = 0;
              nsize = 0;
              vm_map_lock(srcmap);
      
              if (uvm_map_lookup_entry(srcmap, start, &entry)) {
      
                      /* "start" is within an entry */
    5                 if (flags & UVM_EXTRACT_QREF) {
      
                              /*
                               * for quick references we don't clip the entry, so
                               * the entry may map space "before" the starting
                               * virtual address... this is the "fudge" factor
                               * (which can be non-zero only the first time
                               * through the "while" loop in step 3).
                               */
      
    5                         fudge = start - entry->start;
                      } else {
      
                              /*
                                * normal reference: we clip the entry to fit (thus
                               * fudge is zero)
                               */
      
                              UVM_MAP_CLIP_START(srcmap, entry, start);
                              SAVE_HINT(srcmap, srcmap->hint, entry->prev);
                              fudge = 0;
                      }
              } else {
      
                      /* "start" is not within an entry ... skip to next entry */
                      if (flags & UVM_EXTRACT_CONTIG) {
                              error = EINVAL;
                              goto bad;    /* definite hole here ... */
                      }
      
                      entry = entry->next;
                      fudge = 0;
              }
      
              /* save values from srcmap for step 6 */
              orig_entry = entry;
              orig_fudge = fudge;
      
              /*
               * step 3: now start looping through the map entries, extracting
               * as we go.
               */
      
    5         while (entry->start < end && entry != &srcmap->header) {
      
                      /* if we are not doing a quick reference, clip it */
    5                 if ((flags & UVM_EXTRACT_QREF) == 0)
                              UVM_MAP_CLIP_END(srcmap, entry, end);
      
                      /* clear needs_copy (allow chunking) */
    5                 if (UVM_ET_ISNEEDSCOPY(entry)) {
    4                         amap_copy(srcmap, entry,
                                  AMAP_COPY_NOWAIT|AMAP_COPY_NOMERGE, start, end);
                              if (UVM_ET_ISNEEDSCOPY(entry)) {  /* failed? */
                                      error = ENOMEM;
                                      goto bad;
                              }
      
                              /* amap_copy could clip (during chunk)!  update fudge */
    4                         if (fudge) {
                                      fudge = start - entry->start;
                                      orig_fudge = fudge;
                              }
                      }
      
                      /* calculate the offset of this from "start" */
                      oldoffset = (entry->start + fudge) - start;
      
                      /* allocate a new map entry */
    5                 newentry = uvm_mapent_alloc(dstmap, 0);
                      if (newentry == NULL) {
                              error = ENOMEM;
                              goto bad;
                      }
      
                      /* set up new map entry */
                      newentry->next = NULL;
                      newentry->prev = endchain;
                      newentry->start = dstaddr + oldoffset;
    1                 newentry->end =
                          newentry->start + (entry->end - (entry->start + fudge));
    1                 if (newentry->end > newend || newentry->end < newentry->start)
    4                         newentry->end = newend;
    5                 newentry->object.uvm_obj = entry->object.uvm_obj;
                      if (newentry->object.uvm_obj) {
    3                         if (newentry->object.uvm_obj->pgops->pgo_reference)
    3                                 newentry->object.uvm_obj->pgops->
                                          pgo_reference(newentry->object.uvm_obj);
    3                         newentry->offset = entry->offset + fudge;
                      } else {
    3                         newentry->offset = 0;
                      }
    5                 newentry->etype = entry->etype;
    2                 if (flags & UVM_EXTRACT_PROT_ALL) {
                              newentry->protection = newentry->max_protection =
                                  UVM_PROT_ALL;
                      } else {
    5                         newentry->protection = (flags & UVM_EXTRACT_FIXPROT) ?
                                  entry->max_protection : entry->protection;
                              newentry->max_protection = entry->max_protection;
                      }
    5                 newentry->inheritance = entry->inheritance;
                      newentry->wired_count = 0;
                      newentry->aref.ar_amap = entry->aref.ar_amap;
                      if (newentry->aref.ar_amap) {
                              newentry->aref.ar_pageoff =
    4                             entry->aref.ar_pageoff + (fudge >> PAGE_SHIFT);
                              uvm_map_reference_amap(newentry, AMAP_SHARED |
    5                             ((flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0));
                      } else {
    2                         newentry->aref.ar_pageoff = 0;
                      }
    5                 newentry->advice = entry->advice;
                      if ((flags & UVM_EXTRACT_QREF) != 0) {
    5                         newentry->flags |= UVM_MAP_NOMERGE;
                      }
      
                      /* now link it on the chain */
    5                 nchain++;
                      nsize += newentry->end - newentry->start;
                      if (endchain == NULL) {
                              chain = endchain = newentry;
                      } else {
                              endchain->next = newentry;
                              endchain = newentry;
                      }
      
                      /* end of 'while' loop! */
    5                 if ((flags & UVM_EXTRACT_CONTIG) && entry->end < end &&
                          (entry->next == &srcmap->header ||
                          entry->next->start != entry->end)) {
                              error = EINVAL;
                              goto bad;
                      }
    5                 entry = entry->next;
                      fudge = 0;
              }
      
              /*
               * step 4: close off chain (in format expected by uvm_map_replace)
               */
      
    5         if (chain)
    5                 chain->prev = endchain;
      
              /*
               * step 5: attempt to lock the dest map so we can pmap_copy.
               * note usage of copy_ok:
               *   1 => dstmap locked, pmap_copy ok, and we "replace" here (step 5)
               *   0 => dstmap unlocked, NO pmap_copy, and we will "replace" in step 7
               */
      
    5         if (srcmap == dstmap || vm_map_lock_try(dstmap) == true) {
                      copy_ok = 1;
    5                 if (!uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
                          nchain, nsize, &resentry)) {
                              if (srcmap != dstmap)
                                      vm_map_unlock(dstmap);
                              error = EIO;
                              goto bad;
                      }
              } else {
                      copy_ok = 0;
                /* replacement deferred until step 7 */
              }
      
              /*
               * step 6: traverse the srcmap a second time to do the following:
               *  - if we got a lock on the dstmap do pmap_copy
               *  - if UVM_EXTRACT_REMOVE remove the entries
               * we make use of orig_entry and orig_fudge (saved in step 2)
               */
      
              if (copy_ok || (flags & UVM_EXTRACT_REMOVE)) {
      
                      /* purge possible stale hints from srcmap */
    5                 if (flags & UVM_EXTRACT_REMOVE) {
                              SAVE_HINT(srcmap, srcmap->hint, orig_entry->prev);
                              if (srcmap->first_free != &srcmap->header &&
                                  srcmap->first_free->start >= start)
                                      srcmap->first_free = orig_entry->prev;
                      }
      
    5                 entry = orig_entry;
                      fudge = orig_fudge;
                      deadentry = NULL;        /* for UVM_EXTRACT_REMOVE */
      
    5                 while (entry->start < end && entry != &srcmap->header) {
                              if (copy_ok) {
                                      oldoffset = (entry->start + fudge) - start;
                                      elen = MIN(end, entry->end) -
                                          (entry->start + fudge);
                                      pmap_copy(dstmap->pmap, srcmap->pmap,
                                          dstaddr + oldoffset, elen,
                                          entry->start + fudge);
                              }
      
                              /* we advance "entry" in the following if statement */
    5                         if (flags & UVM_EXTRACT_REMOVE) {
                                      uvm_map_lock_entry(entry);
                                      pmap_remove(srcmap->pmap, entry->start,
                                                      entry->end);
                                      uvm_map_unlock_entry(entry);
                                      oldentry = entry;        /* save entry */
                                      entry = entry->next;        /* advance */
                                      uvm_map_entry_unlink(srcmap, oldentry);
                                                              /* add to dead list */
                                      oldentry->next = deadentry;
                                      deadentry = oldentry;
                              } else {
    5                                 entry = entry->next;                /* advance */
                              }
      
                              /* end of 'while' loop */
                              fudge = 0;
                      }
    5                 pmap_update(srcmap->pmap);
      
                      /*
                       * unlock dstmap.  we will dispose of deadentry in
                       * step 7 if needed
                       */
      
    5                 if (copy_ok && srcmap != dstmap)
    5                         vm_map_unlock(dstmap);
      
              } else {
                      deadentry = NULL;
              }
      
              /*
               * step 7: we are done with the source map, unlock.   if copy_ok
               * is 0 then we have not replaced the dummy mapping in dstmap yet
               * and we need to do so now.
               */
      
    5         vm_map_unlock(srcmap);
              if ((flags & UVM_EXTRACT_REMOVE) && deadentry)
                      uvm_unmap_detach(deadentry, 0);   /* dispose of old entries */
      
              /* now do the replacement if we didn't do it in step 5 */
              if (copy_ok == 0) {
                      vm_map_lock(dstmap);
                      error = uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
                          nchain, nsize, &resentry);
                      vm_map_unlock(dstmap);
      
                      if (error == false) {
                              error = EIO;
                              goto bad2;
                      }
              }
      
    5         if (resentry != NULL)
    5                 uvm_mapent_free(resentry);
      
              return (0);
      
              /*
               * bad: failure recovery
               */
      bad:
              vm_map_unlock(srcmap);
      bad2:                        /* src already unlocked */
              if (chain)
                      uvm_unmap_detach(chain,
                          (flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0);
      
              if (resentry != NULL)
                      uvm_mapent_free(resentry);
      
    5         if ((flags & UVM_EXTRACT_RESERVED) == 0) {
                      uvm_unmap(dstmap, dstaddr, dstaddr+len);   /* ??? */
              }
              return (error);
      }
      
      /* end of extraction functions */
      
      /*
       * uvm_map_submap: punch down part of a map into a submap
       *
       * => only the kernel_map is allowed to be submapped
       * => the purpose of submapping is to break up the locking granularity
       *        of a larger map
       * => the range specified must have been mapped previously with a uvm_map()
       *        call [with uobj==NULL] to create a blank map entry in the main map.
       *        [And it had better still be blank!]
       * => maps which contain submaps should never be copied or forked.
       * => to remove a submap, use uvm_unmap() on the main map
       *        and then uvm_map_deallocate() the submap.
       * => main map must be unlocked.
       * => submap must have been init'd and have a zero reference count.
       *        [need not be locked as we don't actually reference it]
       */
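
       /*
        * Illustrative sketch of typical usage, assuming the blank range
        * [va, va + size) was reserved earlier with uvm_map() and uobj == NULL,
        * as required above; "mysubmap", "va" and "size" are hypothetical
        * names, and uvm_km_suballoc() is the usual wrapper for this whole
        * sequence:
        *
        *      struct vm_map *mysubmap = ...;  (init'd, zero reference count)
        *      vaddr_t va = ...;               (start of the pre-reserved range)
        *      vsize_t size = ...;
        *      int error;
        *
        *      error = uvm_map_submap(kernel_map, va, va + size, mysubmap);
        *      if (error)
        *              panic("could not install submap");
        */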
      
      int
      uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
          struct vm_map *submap)
      {
              struct vm_map_entry *entry;
              int error;
      
              vm_map_lock(map);
              VM_MAP_RANGE_CHECK(map, start, end);
      
              if (uvm_map_lookup_entry(map, start, &entry)) {
                      UVM_MAP_CLIP_START(map, entry, start);
                      UVM_MAP_CLIP_END(map, entry, end);        /* to be safe */
              } else {
                      entry = NULL;
              }
      
              if (entry != NULL &&
                  entry->start == start && entry->end == end &&
                  entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
                  !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
                      entry->etype |= UVM_ET_SUBMAP;
                      entry->object.sub_map = submap;
                      entry->offset = 0;
                      uvm_map_reference(submap);
                      error = 0;
              } else {
                      error = EINVAL;
              }
              vm_map_unlock(map);
      
              return error;
      }
      
      /*
       * uvm_map_protect_user: change map protection on behalf of the user.
       * Enforces PAX settings as necessary.
       */
      int
      uvm_map_protect_user(struct lwp *l, vaddr_t start, vaddr_t end,
          vm_prot_t new_prot)
      {
              int error;
      
   25         if ((error = PAX_MPROTECT_VALIDATE(l, new_prot)))
                      return error;
      
   24         return uvm_map_protect(&l->l_proc->p_vmspace->vm_map, start, end,
                  new_prot, false);
      }
      
      
      /*
       * uvm_map_protect: change map protection
       *
       * => set_max means set max_protection.
       * => map must be unlocked.
       */
      
      #define MASK(entry)        (UVM_ET_ISCOPYONWRITE(entry) ? \
                               ~VM_PROT_WRITE : VM_PROT_ALL)
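
       /*
        * Note on MASK(): for copy-on-write entries we withhold VM_PROT_WRITE
        * from the pmap-level protection so that the first write still faults
        * and the copy can be performed; the map entry itself keeps the full
        * protection value.
        */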
      
      int
      uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
          vm_prot_t new_prot, bool set_max)
      {
   25         struct vm_map_entry *current, *entry;
              int error = 0;
              UVMHIST_FUNC("uvm_map_protect"); UVMHIST_CALLED(maphist);
              UVMHIST_LOG(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_prot=%#jx)",
                  (uintptr_t)map, start, end, new_prot);
      
              vm_map_lock(map);
              VM_MAP_RANGE_CHECK(map, start, end);
              if (uvm_map_lookup_entry(map, start, &entry)) {
   25                 UVM_MAP_CLIP_START(map, entry, start);
              } else {
    1                 entry = entry->next;
              }
      
              /*
               * make a first pass to check for protection violations.
               */
      
              current = entry;
   25         while ((current != &map->header) && (current->start < end)) {
   25                 if (UVM_ET_ISSUBMAP(current)) {
                              error = EINVAL;
                              goto out;
                      }
   25                 if ((new_prot & current->max_protection) != new_prot) {
                              error = EACCES;
                              goto out;
                      }
                      /*
                       * Don't allow VM_PROT_EXECUTE to be set on entries that
                       * point to vnodes that are associated with a NOEXEC file
                       * system.
                       */
   24                 if (UVM_ET_ISOBJ(current) &&
    4                     UVM_OBJ_IS_VNODE(current->object.uvm_obj)) {
                              struct vnode *vp =
                                  (struct vnode *) current->object.uvm_obj;
      
    2                         if ((new_prot & VM_PROT_EXECUTE) != 0 &&
    1                             (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
                                      error = EACCES;
                                      goto out;
                              }
                      }
      
   24                 current = current->next;
              }
      
               /* go back and fix up protections (start is already clipped; we still clip the end). */
      
              current = entry;
   23         while ((current != &map->header) && (current->start < end)) {
                      vm_prot_t old_prot;
      
   23                 UVM_MAP_CLIP_END(map, current, end);
   23                 old_prot = current->protection;
                      if (set_max)
                              current->protection =
                                  (current->max_protection = new_prot) & old_prot;
                      else
   23                         current->protection = new_prot;
      
                      /*
                       * update physical map if necessary.  worry about copy-on-write
                       * here -- CHECK THIS XXX
                       */
      
   23                 if (current->protection != old_prot) {
                              /* update pmap! */
   23                         uvm_map_lock_entry(current);
   23                         pmap_protect(map->pmap, current->start, current->end,
                                  current->protection & MASK(current));
   23                         uvm_map_unlock_entry(current);
      
                              /*
                               * If this entry points at a vnode, and the
                               * protection includes VM_PROT_EXECUTE, mark
                               * the vnode as VEXECMAP.
                               */
                              if (UVM_ET_ISOBJ(current)) {
    4                                 struct uvm_object *uobj =
                                          current->object.uvm_obj;
      
                                      if (UVM_OBJ_IS_VNODE(uobj) &&
    2                                     (current->protection & VM_PROT_EXECUTE)) {
    1                                         vn_markexec((struct vnode *) uobj);
                                      }
                              }
                      }
      
                      /*
                       * If the map is configured to lock any future mappings,
                       * wire this entry now if the old protection was VM_PROT_NONE
                       * and the new protection is not VM_PROT_NONE.
                       */
      
   23                 if ((map->flags & VM_MAP_WIREFUTURE) != 0 &&
    3                     VM_MAPENT_ISWIRED(current) == 0 &&
    3                     old_prot == VM_PROT_NONE &&
                          new_prot != VM_PROT_NONE) {
      
                              /*
                               * We must call pmap_update() here because the
                               * pmap_protect() call above might have removed some
                               * pmap entries and uvm_map_pageable() might create
                               * some new pmap entries that rely on the prior
                               * removals being completely finished.
                               */
      
    2                         pmap_update(map->pmap);
      
                              if (uvm_map_pageable(map, current->start,
                                  current->end, false,
                                  UVM_LK_ENTER|UVM_LK_EXIT) != 0) {
      
                                      /*
                                       * If locking the entry fails, remember the
                                       * error if it's the first one.  Note we
                                       * still continue setting the protection in
                                       * the map, but will return the error
                                       * condition regardless.
                                       *
                                       * XXX Ignore what the actual error is,
                                       * XXX just call it a resource shortage
                                       * XXX so that it doesn't get confused
                                       * XXX what uvm_map_protect() itself would
                                       * XXX normally return.
                                       */
      
                                      error = ENOMEM;
                              }
                      }
   23                 current = current->next;
              }
   23         pmap_update(map->pmap);
      
   25  out:
              vm_map_unlock(map);
      
              UVMHIST_LOG(maphist, "<- done, error=%jd",error,0,0,0);
              return error;
      }
      
      #undef  MASK
      
      /*
       * uvm_map_inherit: set inheritance code for range of addrs in map.
       *
       * => map must be unlocked
       * => note that the inherit code is used during a "fork".  see fork
       *        code for details.
       */
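
       /*
        * Illustrative sketch of typical usage, roughly what the minherit(2)
        * path boils down to; "p", "addr" and "len" are hypothetical names:
        *
        *      error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + len,
        *          MAP_INHERIT_SHARE);
        */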
      
      int
      uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
          vm_inherit_t new_inheritance)
      {
              struct vm_map_entry *entry, *temp_entry;
              UVMHIST_FUNC("uvm_map_inherit"); UVMHIST_CALLED(maphist);
              UVMHIST_LOG(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_inh=%#jx)",
                  (uintptr_t)map, start, end, new_inheritance);
      
              switch (new_inheritance) {
              case MAP_INHERIT_NONE:
              case MAP_INHERIT_COPY:
              case MAP_INHERIT_SHARE:
              case MAP_INHERIT_ZERO:
                      break;
              default:
                      UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
                      return EINVAL;
              }
      
              vm_map_lock(map);
              VM_MAP_RANGE_CHECK(map, start, end);
              if (uvm_map_lookup_entry(map, start, &temp_entry)) {
                      entry = temp_entry;
                      UVM_MAP_CLIP_START(map, entry, start);
               } else {
                      entry = temp_entry->next;
              }
              while ((entry != &map->header) && (entry->start < end)) {
                      UVM_MAP_CLIP_END(map, entry, end);
                      entry->inheritance = new_inheritance;
                      entry = entry->next;
              }
              vm_map_unlock(map);
              UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
              return 0;
      }
      
      /*
       * uvm_map_advice: set advice code for range of addrs in map.
       *
       * => map must be unlocked
       */
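
       /*
        * Illustrative sketch of typical usage, roughly what madvise(2) does
        * for the plain advice values; "p", "addr" and "len" are hypothetical
        * names:
        *
        *      error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + len,
        *          MADV_SEQUENTIAL);
        */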
      
      int
      uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
      {
    7         struct vm_map_entry *entry, *temp_entry;
              UVMHIST_FUNC("uvm_map_advice"); UVMHIST_CALLED(maphist);
              UVMHIST_LOG(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_adv=%#jx)",
                  (uintptr_t)map, start, end, new_advice);
      
              vm_map_lock(map);
              VM_MAP_RANGE_CHECK(map, start, end);
              if (uvm_map_lookup_entry(map, start, &temp_entry)) {
    6                 entry = temp_entry;
    3                 UVM_MAP_CLIP_START(map, entry, start);
              } else {
    1                 entry = temp_entry->next;
              }
      
              /*
               * XXXJRT: disallow holes?
               */
      
    7         while ((entry != &map->header) && (entry->start < end)) {
    5                 UVM_MAP_CLIP_END(map, entry, end);
      
    5                 switch (new_advice) {
                      case MADV_NORMAL:
                      case MADV_RANDOM:
                      case MADV_SEQUENTIAL:
                              /* nothing special here */
                              break;
      
                      default:
                              vm_map_unlock(map);
                              UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
                              return EINVAL;
                      }
    5                 entry->advice = new_advice;
                      entry = entry->next;
              }
      
    7         vm_map_unlock(map);
              UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
    7         return 0;
      }
      
      /*
       * uvm_map_willneed: apply MADV_WILLNEED
       */
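
       /*
        * Illustrative sketch of typical usage, roughly how an
        * madvise(MADV_WILLNEED) request would reach this function; "p",
        * "addr" and "len" are hypothetical names:
        *
        *      error = uvm_map_willneed(&p->p_vmspace->vm_map, addr, addr + len);
        */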
      
      int
      uvm_map_willneed(struct vm_map *map, vaddr_t start, vaddr_t end)
      {
    4         struct vm_map_entry *entry;
              UVMHIST_FUNC("uvm_map_willneed"); UVMHIST_CALLED(maphist);
              UVMHIST_LOG(maphist,"(map=%#jx,start=%#jx,end=%#jx)",
                  (uintptr_t)map, start, end, 0);
      
              vm_map_lock_read(map);
              VM_MAP_RANGE_CHECK(map, start, end);
              if (!uvm_map_lookup_entry(map, start, &entry)) {
    1                 entry = entry->next;
              }
    4         while (entry->start < end) {
    4                 struct vm_amap * const amap = entry->aref.ar_amap;
                      struct uvm_object * const uobj = entry->object.uvm_obj;
      
                      KASSERT(entry != &map->header);
    4                 KASSERT(start < entry->end);
                      /*
                        * For now, we handle only the easy but commonly-requested case,
                        * i.e. start prefetching of backing uobj pages.
                       *
                       * XXX It might be useful to pmap_enter() the already-in-core
                       * pages by inventing a "weak" mode for uvm_fault() which would
                       * only do the PGO_LOCKED pgo_get().
                       */
    4                 if (UVM_ET_ISOBJ(entry) && amap == NULL && uobj != NULL) {
                              off_t offset;
                              off_t size;
      
    4                         offset = entry->offset;
                              if (start < entry->start) {
    1                                 offset += entry->start - start;
                              }
    4                         size = entry->offset + (entry->end - entry->start);
                              if (entry->end < end) {
    4                                 size -= end - entry->end;
                              }
                              uvm_readahead(uobj, offset, size);
                      }
    4                 entry = entry->next;
              }
    4         vm_map_unlock_read(map);
              UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
              return 0;
      }
      
      /*
       * uvm_map_pageable: sets the pageability of a range in a map.
       *
       * => wires map entries.  should not be used for transient page locking.
       *        for that, use uvm_fault_wire()/uvm_fault_unwire() (see uvm_vslock()).
       * => regions specified as not pageable require lock-down (wired) memory
       *        and page tables.
       * => map must never be read-locked
       * => if islocked is true, map is already write-locked
       * => we always unlock the map, since we must downgrade to a read-lock
       *        to call uvm_fault_wire()
       * => XXXCDC: check this and try and clean it up.
       */
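
       /*
        * Illustrative sketch of typical usage: an mlock(2)-style wire of a
        * user range followed by the matching unwire; "p", "addr" and "len"
        * are hypothetical names, and no map lock is held (lockflags == 0):
        *
        *      error = uvm_map_pageable(&p->p_vmspace->vm_map, addr,
        *          addr + len, false, 0);              (wire the range)
        *      ...
        *      error = uvm_map_pageable(&p->p_vmspace->vm_map, addr,
        *          addr + len, true, 0);               (unwire it again)
        */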
      
      int
      uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
          bool new_pageable, int lockflags)
      {
   40         struct vm_map_entry *entry, *start_entry, *failed_entry;
              int rv;
      #ifdef DIAGNOSTIC
              u_int timestamp_save;
      #endif
              UVMHIST_FUNC("uvm_map_pageable"); UVMHIST_CALLED(maphist);
              UVMHIST_LOG(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_pageable=%ju)",
                  (uintptr_t)map, start, end, new_pageable);
              KASSERT(map->flags & VM_MAP_PAGEABLE);
      
   40         if ((lockflags & UVM_LK_ENTER) == 0)
   23                 vm_map_lock(map);
   40         VM_MAP_RANGE_CHECK(map, start, end);
      
              /*
               * only one pageability change may take place at one time, since
               * uvm_fault_wire assumes it will be called only once for each
               * wiring/unwiring.  therefore, we have to make sure we're actually
               * changing the pageability for the entire region.  we do so before
               * making any changes.
               */
      
   40         if (uvm_map_lookup_entry(map, start, &start_entry) == false) {
                      if ((lockflags & UVM_LK_EXIT) == 0)
                              vm_map_unlock(map);
      
                      UVMHIST_LOG(maphist,"<- done (fault)",0,0,0,0);
   39                 return EFAULT;
              }
   40         entry = start_entry;
      
              if (start == end) {                /* nothing required */
    4                 if ((lockflags & UVM_LK_EXIT) == 0)
                              vm_map_unlock(map);
      
                      UVMHIST_LOG(maphist,"<- done (nothing)",0,0,0,0);
                      return 0;
              }
      
              /*
               * handle wiring and unwiring separately.
               */
      
   40         if (new_pageable) {                /* unwire */
    8                 UVM_MAP_CLIP_START(map, entry, start);
      
                      /*
                       * unwiring.  first ensure that the range to be unwired is
                       * really wired down and that there are no holes.
                       */
      
    8                 while ((entry != &map->header) && (entry->start < end)) {
    8                         if (entry->wired_count == 0 ||
    5                             (entry->end < end &&
                                   (entry->next == &map->header ||
    2                               entry->next->start > entry->end))) {
    7                                 if ((lockflags & UVM_LK_EXIT) == 0)
    7                                         vm_map_unlock(map);
                                      UVMHIST_LOG(maphist, "<- done (INVAL)",0,0,0,0);
                                      return EINVAL;
                              }
                              entry = entry->next;
                      }
      
                      /*
                       * POSIX 1003.1b - a single munlock call unlocks a region,
                       * regardless of the number of mlock calls made on that
                       * region.
                       */
      
    4                 entry = start_entry;
    4                 while ((entry != &map->header) && (entry->start < end)) {
    4                         UVM_MAP_CLIP_END(map, entry, end);
    4                         if (VM_MAPENT_ISWIRED(entry))
    4                                 uvm_map_entry_unwire(map, entry);
    4                         entry = entry->next;
                      }
                      if ((lockflags & UVM_LK_EXIT) == 0)
                              vm_map_unlock(map);
                      UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
                      return 0;
              }
      
              /*
               * wire case: in two passes [XXXCDC: ugly block of code here]
               *
               * 1: holding the write lock, we create any anonymous maps that need
               *    to be created.  then we clip each map entry to the region to
               *    be wired and increment its wiring count.
               *
               * 2: we downgrade to a read lock, and call uvm_fault_wire to fault
               *    in the pages for any newly wired area (wired_count == 1).
               *
               *    downgrading to a read lock for uvm_fault_wire avoids a possible
               *    deadlock with another thread that may have faulted on one of
               *    the pages to be wired (it would mark the page busy, blocking
               *    us, then in turn block on the map lock that we hold).  because
               *    of problems in the recursive lock package, we cannot upgrade
               *    to a write lock in vm_map_lookup.  thus, any actions that
               *    require the write lock must be done beforehand.  because we
               *    keep the read lock on the map, the copy-on-write status of the
               *    entries we modify here cannot change.
               */
      
   37         while ((entry != &map->header) && (entry->start < end)) {
   37                 if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
      
                              /*
                               * perform actions of vm_map_lookup that need the
                               * write lock on the map: create an anonymous map
                               * for a copy-on-write region, or an anonymous map
                               * for a zero-fill region.  (XXXCDC: submap case
                               * ok?)
                               */
      
   37                         if (!UVM_ET_ISSUBMAP(entry)) {  /* not submap */
   37                                 if (UVM_ET_ISNEEDSCOPY(entry) &&
   36                                     ((entry->max_protection & VM_PROT_WRITE) ||
    7                                      (entry->object.uvm_obj == NULL))) {
   31                                         amap_copy(map, entry, 0, start, end);
                                              /* XXXCDC: wait OK? */
                                      }
                              }
                      }
   37                 UVM_MAP_CLIP_START(map, entry, start);
   37                 UVM_MAP_CLIP_END(map, entry, end);
   37                 entry->wired_count++;
      
                      /*
                       * Check for holes
                       */
      
                      if (entry->protection == VM_PROT_NONE ||
   37                     (entry->end < end &&
                           (entry->next == &map->header ||
    6                       entry->next->start > entry->end))) {
      
                              /*
                               * found one.  amap creation actions do not need to
                               * be undone, but the wired counts need to be restored.
                               */
      
    2                         while (entry != &map->header && entry->end > start) {
    2                                 entry->wired_count--;
                                      entry = entry->prev;
                              }
                              if ((lockflags & UVM_LK_EXIT) == 0)
                                      vm_map_unlock(map);
                              UVMHIST_LOG(maphist,"<- done (INVALID WIRE)",0,0,0,0);
                              return EINVAL;
                      }
                      entry = entry->next;
              }
      
              /*
               * Pass 2.
               */
      
      #ifdef DIAGNOSTIC
   36         timestamp_save = map->timestamp;
      #endif
              vm_map_busy(map);
              vm_map_unlock(map);
      
              rv = 0;
              entry = start_entry;
   36         while (entry != &map->header && entry->start < end) {
   36                 if (entry->wired_count == 1) {
   36                         rv = uvm_fault_wire(map, entry->start, entry->end,
                                  entry->max_protection, 1);
                              if (rv) {
      
                                      /*
                                       * wiring failed.  break out of the loop.
                                       * we'll clean up the map below, once we
                                       * have a write lock again.
                                       */
      
                                      break;
                              }
                      }
   35                 entry = entry->next;
              }
      
              if (rv) {        /* failed? */
      
                      /*
                       * Get back to an exclusive (write) lock.
                       */
      
    1                 vm_map_lock(map);
                      vm_map_unbusy(map);
      
      #ifdef DIAGNOSTIC
                      if (timestamp_save + 1 != map->timestamp)
                              panic("uvm_map_pageable: stale map");
      #endif
      
                      /*
                       * first drop the wiring count on all the entries
                       * which haven't actually been wired yet.
                       */
      
                      failed_entry = entry;
    1                 while (entry != &map->header && entry->start < end) {
    1                         entry->wired_count--;
                              entry = entry->next;
                      }
      
                      /*
                       * now, unwire all the entries that were successfully
                       * wired above.
                       */
      
    1                 entry = start_entry;
                      while (entry != failed_entry) {
                              entry->wired_count--;
                              if (VM_MAPENT_ISWIRED(entry) == 0)
                                      uvm_map_entry_unwire(map, entry);
                              entry = entry->next;
                      }
    1                 if ((lockflags & UVM_LK_EXIT) == 0)
    5                         vm_map_unlock(map);
                      UVMHIST_LOG(maphist, "<- done (RV=%jd)", rv,0,0,0);
                      return (rv);
              }
      
   35         if ((lockflags & UVM_LK_EXIT) == 0) {
   33                 vm_map_unbusy(map);
              } else {
      
                      /*
                       * Get back to an exclusive (write) lock.
                       */
      
    2                 vm_map_lock(map);
                      vm_map_unbusy(map);
              }
      
              UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
              return 0;
      }
      
      /*
       * uvm_map_pageable_all: special case of uvm_map_pageable - affects
       * all mapped regions.
       *
       * => map must not be locked.
       * => if no flags are specified, all regions are unwired.
       * => XXXJRT: has some of the same problems as uvm_map_pageable() above.
       */
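
       /*
        * Illustrative sketch of typical usage: an mlockall(2)-style request;
        * "p" and "lim" are hypothetical names.  flags == 0 unwires everything,
        * MCL_CURRENT wires what is mapped now, and MCL_FUTURE arranges for
        * future mappings to be wired as well:
        *
        *      error = uvm_map_pageable_all(&p->p_vmspace->vm_map,
        *          MCL_CURRENT | MCL_FUTURE, lim);
        */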
      
      int
      uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
      {
              struct vm_map_entry *entry, *failed_entry;
              vsize_t size;
              int rv;
      #ifdef DIAGNOSTIC
              u_int timestamp_save;
      #endif
              UVMHIST_FUNC("uvm_map_pageable_all"); UVMHIST_CALLED(maphist);
              UVMHIST_LOG(maphist,"(map=%#jx,flags=%#jx)", (uintptr_t)map, flags,
                  0, 0);
      
    4         KASSERT(map->flags & VM_MAP_PAGEABLE);
      
    4         vm_map_lock(map);
      
              /*
               * handle wiring and unwiring separately.
               */
      
              if (flags == 0) {                        /* unwire */
      
                      /*
                       * POSIX 1003.1b -- munlockall unlocks all regions,
                       * regardless of how many times mlockall has been called.
                       */
      
    1                 for (entry = map->header.next; entry != &map->header;
    1                      entry = entry->next) {
    1                         if (VM_MAPENT_ISWIRED(entry))
    1                                 uvm_map_entry_unwire(map, entry);
                      }
    1                 map->flags &= ~VM_MAP_WIREFUTURE;
                      vm_map_unlock(map);
                      UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
                      return 0;
              }
      
    3         if (flags & MCL_FUTURE) {
      
                      /*
                       * must wire all future mappings; remember this.
                       */
      
    2                 map->flags |= VM_MAP_WIREFUTURE;
              }
      
    3         if ((flags & MCL_CURRENT) == 0) {
      
                      /*
                       * no more work to do!
                       */
      
                      UVMHIST_LOG(maphist,"<- done (OK no wire)",0,0,0,0);
                      vm_map_unlock(map);
                      return 0;
              }
      
              /*
               * wire case: in three passes [XXXCDC: ugly block of code here]
               *
               * 1: holding the write lock, count all pages mapped by non-wired
               *    entries.  if this would cause us to go over our limit, we fail.
               *
               * 2: still holding the write lock, we create any anonymous maps that
                *    need to be created.  then we increment each entry's wiring count.
               *
               * 3: we downgrade to a read lock, and call uvm_fault_wire to fault
               *    in the pages for any newly wired area (wired_count == 1).
               *
               *    downgrading to a read lock for uvm_fault_wire avoids a possible
               *    deadlock with another thread that may have faulted on one of
               *    the pages to be wired (it would mark the page busy, blocking
               *    us, then in turn block on the map lock that we hold).  because
               *    of problems in the recursive lock package, we cannot upgrade
               *    to a write lock in vm_map_lookup.  thus, any actions that
               *    require the write lock must be done beforehand.  because we
               *    keep the read lock on the map, the copy-on-write status of the
               *    entries we modify here cannot change.
               */
      
    1         for (size = 0, entry = map->header.next; entry != &map->header;
    1              entry = entry->next) {
    1                 if (entry->protection != VM_PROT_NONE &&
    1                     VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
    1                         size += entry->end - entry->start;
                      }
              }
      
    1         if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
    1                 vm_map_unlock(map);
                      return ENOMEM;
              }
      
    1         if (limit != 0 &&
    1             (size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit)) {
                      vm_map_unlock(map);
                      return ENOMEM;
              }
      
              /*
               * Pass 2.
               */
      
              for (entry = map->header.next; entry != &map->header;
                   entry = entry->next) {
                      if (entry->protection == VM_PROT_NONE)
                              continue;
                      if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
      
                              /*
                               * perform actions of vm_map_lookup that need the
                               * write lock on the map: create an anonymous map
                               * for a copy-on-write region, or an anonymous map
                               * for a zero-fill region.  (XXXCDC: submap case
                               * ok?)
                               */
      
                              if (!UVM_ET_ISSUBMAP(entry)) {        /* not submap */
                                      if (UVM_ET_ISNEEDSCOPY(entry) &&
                                          ((entry->max_protection & VM_PROT_WRITE) ||
                                           (entry->object.uvm_obj == NULL))) {
                                              amap_copy(map, entry, 0, entry->start,
                                                  entry->end);
                                              /* XXXCDC: wait OK? */
                                      }
                              }
                      }
                      entry->wired_count++;
              }
      
              /*
               * Pass 3.
               */
      
      #ifdef DIAGNOSTIC
              timestamp_save = map->timestamp;
      #endif
              vm_map_busy(map);
              vm_map_unlock(map);
      
              rv = 0;
              for (entry = map->header.next; entry != &map->header;
                   entry = entry->next) {
                      if (entry->wired_count == 1) {
                              rv = uvm_fault_wire(map, entry->start, entry->end,
                                  entry->max_protection, 1);
                              if (rv) {
      
                                      /*
                                       * wiring failed.  break out of the loop.
                                       * we'll clean up the map below, once we
                                       * have a write lock again.
                                       */
      
                                      break;
                              }
                      }
              }
      
              if (rv) {
      
                      /*
                * Get back to an exclusive (write) lock.
                       */
      
                      vm_map_lock(map);
                      vm_map_unbusy(map);
      
      #ifdef DIAGNOSTIC
                      if (timestamp_save + 1 != map->timestamp)
                              panic("uvm_map_pageable_all: stale map");
      #endif
      
                      /*
                       * first drop the wiring count on all the entries
                       * which haven't actually been wired yet.
                       *
                       * Skip VM_PROT_NONE entries like we did above.
                       */
      
                      failed_entry = entry;
                      for (/* nothing */; entry != &map->header;
                           entry = entry->next) {
                              if (entry->protection == VM_PROT_NONE)
                                      continue;
                              entry->wired_count--;
                      }
      
                      /*
                       * now, unwire all the entries that were successfully
                       * wired above.
                       *
                       * Skip VM_PROT_NONE entries like we did above.
                       */
      
                      for (entry = map->header.next; entry != failed_entry;
                           entry = entry->next) {
                              if (entry->protection == VM_PROT_NONE)
                                      continue;
                              entry->wired_count--;
                              if (VM_MAPENT_ISWIRED(entry))
                                      uvm_map_entry_unwire(map, entry);
                      }
    2                 vm_map_unlock(map);
                      UVMHIST_LOG(maphist,"<- done (RV=%jd)", rv,0,0,0);
    4                 return (rv);
              }
      
              vm_map_unbusy(map);
      
              UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
              return 0;
      }
      
      /*
       * uvm_map_clean: clean out a map range
       *
       * => valid flags:
       *   if (flags & PGO_CLEANIT): dirty pages are cleaned first
       *   if (flags & PGO_SYNCIO): dirty pages are written synchronously
       *   if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
       *   if (flags & PGO_FREE): any cached pages are freed after clean
       * => returns an error if any part of the specified range isn't mapped
        * => there is never a need to flush the amap layer since anonymous
        *        memory has no permanent home, but we may deactivate pages there
       * => called from sys_msync() and sys_madvise()
       * => caller must not write-lock map (read OK).
       * => we may sleep while cleaning if SYNCIO [with map read-locked]
       */
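
       /*
        * Illustrative sketches of typical usage; "map", "addr" and "len" are
        * hypothetical names.  An msync(2)-style synchronous writeback and an
        * madvise-style discard of cached pages would look roughly like:
        *
        *      error = uvm_map_clean(map, addr, addr + len,
        *          PGO_CLEANIT | PGO_SYNCIO);          (write back dirty pages)
        *
        *      error = uvm_map_clean(map, addr, addr + len, PGO_FREE);
        *                                              (free cached pages)
        */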
      
      int
      uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
      {
   15         struct vm_map_entry *current, *entry;
              struct uvm_object *uobj;
              struct vm_amap *amap;
              struct vm_anon *anon, *anon_tofree;
              struct vm_page *pg;
              vaddr_t offset;
              vsize_t size;
              voff_t uoff;
              int error, refs;
              UVMHIST_FUNC("uvm_map_clean"); UVMHIST_CALLED(maphist);
      
              UVMHIST_LOG(maphist,"(map=%#jx,start=%#jx,end=%#jx,flags=%#jx)",
                  (uintptr_t)map, start, end, flags);
              KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
                      (PGO_FREE|PGO_DEACTIVATE));
      
   15         vm_map_lock_read(map);
              VM_MAP_RANGE_CHECK(map, start, end);
              if (uvm_map_lookup_entry(map, start, &entry) == false) {
                      vm_map_unlock_read(map);
                      return EFAULT;
              }
      
              /*
               * Make a first pass to check for holes and wiring problems.
               */
      
   15         for (current = entry; current->start < end; current = current->next) {
   15                 if (UVM_ET_ISSUBMAP(current)) {
                              vm_map_unlock_read(map);
                              return EINVAL;
                      }
   15                 if ((flags & PGO_FREE) != 0 && VM_MAPENT_ISWIRED(entry)) {
    1                         vm_map_unlock_read(map);
                              return EBUSY;
                      }
   14                 if (end <= current->end) {
                              break;
                      }
   13                 if (current->end != current->next->start) {
    2                         vm_map_unlock_read(map);
                              return EFAULT;
                      }
              }
      
              error = 0;
   13         for (current = entry; start < end; current = current->next) {
   13                 amap = current->aref.ar_amap;        /* upper layer */
                      uobj = current->object.uvm_obj;        /* lower layer */
                      KASSERT(start >= current->start);
      
                      /*
                       * No amap cleaning necessary if:
                       *
                       *        (1) There's no amap.
                       *
                       *        (2) We're not deactivating or freeing pages.
                       */
      
   13                 if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
                              goto flush_object;
      
   11                 offset = start - current->start;
                      size = MIN(end, current->end) - start;
                      anon_tofree = NULL;
      
                      amap_lock(amap);
   11                 for ( ; size != 0; size -= PAGE_SIZE, offset += PAGE_SIZE) {
   11                         anon = amap_lookup(&current->aref, offset);
                              if (anon == NULL)
                                      continue;
      
    8                         KASSERT(anon->an_lock == amap->am_lock);
    8                         pg = anon->an_page;
                              if (pg == NULL) {
                                      continue;
                              }
    8                         if (pg->flags & PG_BUSY) {
                                      continue;
                              }
      
    8                         switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
      
                              /*
                               * In these first 3 cases, we just deactivate the page.
                               */
      
                              case PGO_CLEANIT|PGO_FREE:
                              case PGO_CLEANIT|PGO_DEACTIVATE:
                              case PGO_DEACTIVATE:
    5  deactivate_it:
                                      /*
                                       * skip the page if it's loaned or wired,
                                       * since it shouldn't be on a paging queue
                                       * at all in these cases.
                                       */
      
                                      mutex_enter(&uvm_pageqlock);
                                      if (pg->loan_count != 0 ||
                                          pg->wire_count != 0) {
    1                                         mutex_exit(&uvm_pageqlock);
                                              continue;
                                      }
    4                                 KASSERT(pg->uanon == anon);
    4                                 uvm_pagedeactivate(pg);
                                      mutex_exit(&uvm_pageqlock);
                                      continue;
      
                              case PGO_FREE:
      
                                      /*
                                       * If there are multiple references to
                                       * the amap, just deactivate the page.
                                       */
      
    6                                 if (amap_refs(amap) > 1)
                                              goto deactivate_it;
      
                                      /* skip the page if it's wired */
    6                                 if (pg->wire_count != 0) {
                                              continue;
                                      }
    5                                 amap_unadd(&current->aref, offset);
                                      refs = --anon->an_ref;
                                      if (refs == 0) {
    5                                         anon->an_link = anon_tofree;
                                              anon_tofree = anon;
                                      }
                                      continue;
                              }
                      }
   11                 uvm_anon_freelst(amap, anon_tofree);
      
   13  flush_object:
                      /*
                       * flush pages if we've got a valid backing object.
                       * note that we must always clean object pages before
                       * freeing them since otherwise we could reveal stale
                       * data from files.
                       */
      
    8                 uoff = current->offset + (start - current->start);
                      size = MIN(end, current->end) - start;
                      if (uobj != NULL) {
                              mutex_enter(uobj->vmobjlock);
                              if (uobj->pgops->pgo_put != NULL)
                                      error = (uobj->pgops->pgo_put)(uobj, uoff,
                                          uoff + size, flags | PGO_CLEANIT);
                              else
                                      error = 0;
                      }
                      start += size;
              }
   13         vm_map_unlock_read(map);
   15         return (error);
      }
      
      
      /*
       * uvm_map_checkprot: check protection in map
       *
       * => must allow specified protection in a fully allocated region.
       * => map must be read or write locked by caller.
       */
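
       /*
        * Illustrative sketch of typical usage: verifying that a user range is
        * mapped read/write before touching it, with the map already locked by
        * the caller; "map", "uva" and "len" are hypothetical names:
        *
        *      if (!uvm_map_checkprot(map, uva, uva + len,
        *          VM_PROT_READ | VM_PROT_WRITE))
        *              return EFAULT;
        */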
      
      bool
      uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
          vm_prot_t protection)
      {
              struct vm_map_entry *entry;
              struct vm_map_entry *tmp_entry;
      
              if (!uvm_map_lookup_entry(map, start, &tmp_entry)) {
                      return (false);
              }
              entry = tmp_entry;
              while (start < end) {
                      if (entry == &map->header) {
                              return (false);
                      }
      
                      /*
                       * no holes allowed
                       */
      
                      if (start < entry->start) {
                              return (false);
                      }
      
                      /*
                       * check protection associated with entry
                       */
      
                      if ((entry->protection & protection) != protection) {
                              return (false);
                      }
                      start = entry->end;
                      entry = entry->next;
              }
              return (true);
      }
      
      /*
       * uvmspace_alloc: allocate a vmspace structure.
       *
       * - structure includes vm_map and pmap
       * - XXX: no locking on this structure
       * - refcnt set to 1, rest must be init'd by caller
       */
      struct vmspace *
      uvmspace_alloc(vaddr_t vmin, vaddr_t vmax, bool topdown)
      {
              struct vmspace *vm;
              UVMHIST_FUNC("uvmspace_alloc"); UVMHIST_CALLED(maphist);
      
   65         vm = pool_cache_get(&uvm_vmspace_cache, PR_WAITOK);
              uvmspace_init(vm, NULL, vmin, vmax, topdown);
              UVMHIST_LOG(maphist,"<- done (vm=%#jx)", (uintptr_t)vm, 0, 0, 0);
              return (vm);
      }
      
      /*
       * uvmspace_init: initialize a vmspace structure.
       *
       * - XXX: no locking on this structure
       * - refcnt set to 1, rest must be init'd by caller
       */
      void
      uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin,
          vaddr_t vmax, bool topdown)
      {
              UVMHIST_FUNC("uvmspace_init"); UVMHIST_CALLED(maphist);
      
              UVMHIST_LOG(maphist, "(vm=%#jx, pmap=%#jx, vmin=%#jx, vmax=%#jx",
                  (uintptr_t)vm, (uintptr_t)pmap, vmin, vmax);
              UVMHIST_LOG(maphist, "   topdown=%ju)", topdown, 0, 0, 0);
      
   65         memset(vm, 0, sizeof(*vm));
   65         uvm_map_setup(&vm->vm_map, vmin, vmax, VM_MAP_PAGEABLE
                  | (topdown ? VM_MAP_TOPDOWN : 0)
                  );
              if (pmap)
                      pmap_reference(pmap);
              else
   65                 pmap = pmap_create();
   65         vm->vm_map.pmap = pmap;
              vm->vm_refcnt = 1;
              UVMHIST_LOG(maphist,"<- done",0,0,0,0);
      }
      
      /*
       * uvmspace_share: share a vmspace between two processes
       *
       * - used for vfork, threads(?)
       */
      
      void
      uvmspace_share(struct proc *p1, struct proc *p2)
      {
      
    1         uvmspace_addref(p1->p_vmspace);
              p2->p_vmspace = p1->p_vmspace;
      }
      
      #if 0
      
      /*
       * uvmspace_unshare: ensure that process "p" has its own, unshared, vmspace
       *
       * - XXX: no locking on vmspace
       */
      
      void
      uvmspace_unshare(struct lwp *l)
      {
              struct proc *p = l->l_proc;
              struct vmspace *nvm, *ovm = p->p_vmspace;
      
              if (ovm->vm_refcnt == 1)
                      /* nothing to do: vmspace isn't shared in the first place */
                      return;
      
              /* make a new vmspace, still holding old one */
              nvm = uvmspace_fork(ovm);
      
              kpreempt_disable();
              pmap_deactivate(l);                /* unbind old vmspace */
              p->p_vmspace = nvm;
              pmap_activate(l);                /* switch to new vmspace */
              kpreempt_enable();
      
              uvmspace_free(ovm);                /* drop reference to old vmspace */
      }
      
      #endif
      
      
      /*
       * uvmspace_spawn: a new process has been spawned and needs a vmspace
       */
      
      void
      uvmspace_spawn(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown)
      {
              struct proc *p = l->l_proc;
              struct vmspace *nvm;
      
      #ifdef __HAVE_CPU_VMSPACE_EXEC
              cpu_vmspace_exec(l, start, end);
      #endif
      
              nvm = uvmspace_alloc(start, end, topdown);
              kpreempt_disable();
              p->p_vmspace = nvm;
              pmap_activate(l);
              kpreempt_enable();
      }
      
      /*
       * uvmspace_exec: the process wants to exec a new program
       */
      
      void
      uvmspace_exec(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown)
      {
              struct proc *p = l->l_proc;
              struct vmspace *nvm, *ovm = p->p_vmspace;
              struct vm_map *map;
      
              KASSERT(ovm != NULL);
      #ifdef __HAVE_CPU_VMSPACE_EXEC
              cpu_vmspace_exec(l, start, end);
      #endif
      
              map = &ovm->vm_map;
              /*
               * see if more than one process is using this vmspace...
               */
      
              if (ovm->vm_refcnt == 1
                  && topdown == ((ovm->vm_map.flags & VM_MAP_TOPDOWN) != 0)) {
      
                      /*
                       * if p is the only process using its vmspace then we can safely
                       * recycle that vmspace for the program that is being exec'd.
                       * But only if TOPDOWN matches the requested value for the new
                       * vm space!
                       */
      
                      /*
                       * SYSV SHM semantics require us to kill all segments on an exec
                       */
                      if (uvm_shmexit && ovm->vm_shm)
                              (*uvm_shmexit)(ovm);
      
                      /*
                       * POSIX 1003.1b -- "lock future mappings" is revoked
                       * when a process execs another program image.
                       */
      
                      map->flags &= ~VM_MAP_WIREFUTURE;
      
                      /*
                       * now unmap the old program
                       */
      
                      pmap_remove_all(map->pmap);
                      uvm_unmap(map, vm_map_min(map), vm_map_max(map));
                      KASSERT(map->header.prev == &map->header);
                      KASSERT(map->nentries == 0);
      
                      /*
                       * resize the map
                       */
      
                      vm_map_setmin(map, start);
                      vm_map_setmax(map, end);
              } else {
      
                      /*
                       * p's vmspace is being shared, so we can't reuse it for p since
                       * it is still being used for others.   allocate a new vmspace
                       * for p
                       */
      
                      nvm = uvmspace_alloc(start, end, topdown);
      
                      /*
                       * install new vmspace and drop our ref to the old one.
                       */
      
                      kpreempt_disable();
                      pmap_deactivate(l);
                      p->p_vmspace = nvm;
                      pmap_activate(l);
                      kpreempt_enable();
      
                      uvmspace_free(ovm);
              }
      }
      
      /*
        * uvmspace_addref: add a reference to a vmspace.
       */
      
      void
      uvmspace_addref(struct vmspace *vm)
      {
              struct vm_map *map = &vm->vm_map;
      
               KASSERT((map->flags & VM_MAP_DYING) == 0);

               mutex_enter(&map->misc_lock);
               KASSERT(vm->vm_refcnt > 0);
               vm->vm_refcnt++;
              mutex_exit(&map->misc_lock);
      }
      
      /*
       * uvmspace_free: free a vmspace data structure
       */
      
      void
      uvmspace_free(struct vmspace *vm)
      {
               struct vm_map_entry *dead_entries;
              struct vm_map *map = &vm->vm_map;
              int n;
      
              UVMHIST_FUNC("uvmspace_free"); UVMHIST_CALLED(maphist);
      
              UVMHIST_LOG(maphist,"(vm=%#jx) ref=%jd", (uintptr_t)vm, vm->vm_refcnt,
                  0, 0);
              mutex_enter(&map->misc_lock);
              n = --vm->vm_refcnt;
              mutex_exit(&map->misc_lock);
              if (n > 0)
                       return;
      
              /*
               * at this point, there should be no other references to the map.
               * delete all of the mappings, then destroy the pmap.
               */
      
              map->flags |= VM_MAP_DYING;
              pmap_remove_all(map->pmap);
      
              /* Get rid of any SYSV shared memory segments. */
              if (uvm_shmexit && vm->vm_shm != NULL)
                      (*uvm_shmexit)(vm);
      
              if (map->nentries) {
                      uvm_unmap_remove(map, vm_map_min(map), vm_map_max(map),
                          &dead_entries, 0);
                      if (dead_entries != NULL)
                              uvm_unmap_detach(dead_entries, 0);
              }
              KASSERT(map->nentries == 0);
              KASSERT(map->size == 0);
      
              mutex_destroy(&map->misc_lock);
              rw_destroy(&map->lock);
              cv_destroy(&map->cv);
              pmap_destroy(map->pmap);
              pool_cache_put(&uvm_vmspace_cache, vm);
      }
      
      static struct vm_map_entry *
      uvm_mapent_clone(struct vm_map *new_map, struct vm_map_entry *old_entry,
          int flags)
      {
              struct vm_map_entry *new_entry;
      
               new_entry = uvm_mapent_alloc(new_map, 0);
               /* old_entry -> new_entry */
               uvm_mapent_copy(old_entry, new_entry);
      
              /* new pmap has nothing wired in it */
              new_entry->wired_count = 0;
      
              /*
               * gain reference to object backing the map (can't
               * be a submap, already checked this case).
               */
      
              if (new_entry->aref.ar_amap)
                       uvm_map_reference_amap(new_entry, flags);

               if (new_entry->object.uvm_obj &&
                   new_entry->object.uvm_obj->pgops->pgo_reference)
                       new_entry->object.uvm_obj->pgops->pgo_reference(
                               new_entry->object.uvm_obj);

               /* insert entry at end of new_map's entry list */
               uvm_map_entry_link(new_map, new_map->header.prev,
                  new_entry);
      
              return new_entry;
      }
      
      /*
       * share the mapping: this means we want the old and
       * new entries to share amaps and backing objects.
       */
      static void
      uvm_mapent_forkshared(struct vm_map *new_map, struct vm_map *old_map,
          struct vm_map_entry *old_entry)
      {
              /*
               * if the old_entry needs a new amap (due to prev fork)
               * then we need to allocate it now so that we have
               * something we own to share with the new_entry.   [in
               * other words, we need to clear needs_copy]
               */
      
               if (UVM_ET_ISNEEDSCOPY(old_entry)) {
                       /* get our own amap, clears needs_copy */
                       amap_copy(old_map, old_entry, AMAP_COPY_NOCHUNK,
                           0, 0);
                       /* XXXCDC: WAITOK??? */
               }

               uvm_mapent_clone(new_map, old_entry, AMAP_SHARED);
      }
      
      
      static void
      uvm_mapent_forkcopy(struct vm_map *new_map, struct vm_map *old_map,
          struct vm_map_entry *old_entry)
      {
              struct vm_map_entry *new_entry;
      
              /*
               * copy-on-write the mapping (using mmap's
               * MAP_PRIVATE semantics)
               *
               * allocate new_entry, adjust reference counts.
               * (note that new references are read-only).
               */
      
               new_entry = uvm_mapent_clone(new_map, old_entry, 0);
      
              new_entry->etype |=
                  (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
      
              /*
               * the new entry will need an amap.  it will either
               * need to be copied from the old entry or created
               * from scratch (if the old entry does not have an
               * amap).  can we defer this process until later
               * (by setting "needs_copy") or do we need to copy
               * the amap now?
               *
               * we must copy the amap now if any of the following
               * conditions hold:
               * 1. the old entry has an amap and that amap is
               *    being shared.  this means that the old (parent)
               *    process is sharing the amap with another
               *    process.  if we do not clear needs_copy here
               *    we will end up in a situation where both the
                *    parent and child processes are referring to the
               *    same amap with "needs_copy" set.  if the
               *    parent write-faults, the fault routine will
               *    clear "needs_copy" in the parent by allocating
               *    a new amap.   this is wrong because the
               *    parent is supposed to be sharing the old amap
               *    and the new amap will break that.
               *
               * 2. if the old entry has an amap and a non-zero
               *    wire count then we are going to have to call
               *    amap_cow_now to avoid page faults in the
               *    parent process.   since amap_cow_now requires
               *    "needs_copy" to be clear we might as well
               *    clear it here as well.
               *
               */
      
              if (old_entry->aref.ar_amap != NULL) {
                       if ((amap_flags(old_entry->aref.ar_amap) & AMAP_SHARED) != 0 ||
                           VM_MAPENT_ISWIRED(old_entry)) {

                               amap_copy(new_map, new_entry,
                                  AMAP_COPY_NOCHUNK, 0, 0);
                              /* XXXCDC: M_WAITOK ... ok? */
                      }
              }
      
              /*
               * if the parent's entry is wired down, then the
               * parent process does not want page faults on
               * access to that memory.  this means that we
               * cannot do copy-on-write because we can't write
               * protect the old entry.   in this case we
               * resolve all copy-on-write faults now, using
               * amap_cow_now.   note that we have already
               * allocated any needed amap (above).
               */
      
               if (VM_MAPENT_ISWIRED(old_entry)) {

                       /*
                        * resolve all copy-on-write faults now
                        * (note that there is nothing to do if
                        * the old mapping does not have an amap).
                        */
                       if (old_entry->aref.ar_amap)
                               amap_cow_now(new_map, new_entry);
      
              } else {
                      /*
                       * setup mappings to trigger copy-on-write faults
                       * we must write-protect the parent if it has
                       * an amap and it is not already "needs_copy"...
                       * if it is already "needs_copy" then the parent
                       * has already been write-protected by a previous
                       * fork operation.
                       */
                       if (old_entry->aref.ar_amap &&
                           !UVM_ET_ISNEEDSCOPY(old_entry)) {
                               if (old_entry->max_protection & VM_PROT_WRITE) {
                                       uvm_map_lock_entry(old_entry);
                                       pmap_protect(old_map->pmap,
                                           old_entry->start, old_entry->end,
                                           old_entry->protection & ~VM_PROT_WRITE);
                                       uvm_map_unlock_entry(old_entry);
                               }
                               old_entry->etype |= UVM_ET_NEEDSCOPY;
                      }
              }
      }
      
      /*
       * zero the mapping: the new entry will be zero initialized
       */
      static void
      uvm_mapent_forkzero(struct vm_map *new_map, struct vm_map *old_map,
          struct vm_map_entry *old_entry)
      {
              struct vm_map_entry *new_entry;
      
               new_entry = uvm_mapent_clone(new_map, old_entry, 0);
      
              new_entry->etype |=
                  (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
      
              if (new_entry->aref.ar_amap) {
                      uvm_map_unreference_amap(new_entry, 0);
                      new_entry->aref.ar_pageoff = 0;
                      new_entry->aref.ar_amap = NULL;
              }
      
               if (UVM_ET_ISOBJ(new_entry)) {
                      if (new_entry->object.uvm_obj->pgops->pgo_detach)
                              new_entry->object.uvm_obj->pgops->pgo_detach(
                                  new_entry->object.uvm_obj);
                      new_entry->object.uvm_obj = NULL;
                      new_entry->etype &= ~UVM_ET_OBJ;
              }
      }
      
      /*
       *   F O R K   -   m a i n   e n t r y   p o i n t
       */
      /*
       * uvmspace_fork: fork a process' main map
       *
       * => create a new vmspace for child process from parent.
       * => parent's map must not be locked.
       */
      
      struct vmspace *
      uvmspace_fork(struct vmspace *vm1)
      {
              struct vmspace *vm2;
               struct vm_map *old_map = &vm1->vm_map;
              struct vm_map *new_map;
              struct vm_map_entry *old_entry;
              UVMHIST_FUNC("uvmspace_fork"); UVMHIST_CALLED(maphist);
      
              vm_map_lock(old_map);
      
              vm2 = uvmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
                  vm1->vm_map.flags & VM_MAP_TOPDOWN);
              memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
                  (char *) (vm1 + 1) - (char *) &vm1->vm_startcopy);
              new_map = &vm2->vm_map;                  /* XXX */
      
              old_entry = old_map->header.next;
              new_map->size = old_map->size;
      
              /*
               * go entry-by-entry
               */
      
              while (old_entry != &old_map->header) {
      
                      /*
                       * first, some sanity checks on the old entry
                       */
      
                       KASSERT(!UVM_ET_ISSUBMAP(old_entry));
                       KASSERT(UVM_ET_ISCOPYONWRITE(old_entry) ||
                               !UVM_ET_ISNEEDSCOPY(old_entry));

                       switch (old_entry->inheritance) {
                       case MAP_INHERIT_NONE:
                               /*
                                * drop the mapping, modify size
                                */
                               new_map->size -= old_entry->end - old_entry->start;
                               break;

                       case MAP_INHERIT_SHARE:
                               uvm_mapent_forkshared(new_map, old_map, old_entry);
                               break;

                       case MAP_INHERIT_COPY:
                               uvm_mapent_forkcopy(new_map, old_map, old_entry);
                               break;

                       case MAP_INHERIT_ZERO:
                               uvm_mapent_forkzero(new_map, old_map, old_entry);
                               break;
                       default:
                               KASSERT(0);
                               break;
                       }
                       old_entry = old_entry->next;
              }
      
               pmap_update(old_map->pmap);
               vm_map_unlock(old_map);

               if (uvm_shmfork && vm1->vm_shm)
                       (*uvm_shmfork)(vm1, vm2);

       #ifdef PMAP_FORK
               pmap_fork(vm1->vm_map.pmap, vm2->vm_map.pmap);
      #endif
      
              UVMHIST_LOG(maphist,"<- done",0,0,0,0);
              return (vm2);
      }
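
       /*
        * Illustrative sketch (not part of the original source): the
        * inheritance modes handled by the switch above are normally set
        * from userland with minherit(2) before fork(2).  This is a
        * userland fragment (mmap/minherit/fork), shown only to tie the
        * MAP_INHERIT_* values to their effect on uvmspace_fork();
        * error handling is omitted.
        */
       #if 0
       /* userland code; needs <sys/mman.h> and <unistd.h> */
       static void
       inheritance_example(void)
       {
               size_t len = 4096;
               char *shared = mmap(NULL, len, PROT_READ|PROT_WRITE,
                   MAP_ANON|MAP_PRIVATE, -1, 0);
               char *scratch = mmap(NULL, len, PROT_READ|PROT_WRITE,
                   MAP_ANON|MAP_PRIVATE, -1, 0);

               /* MAP_INHERIT_SHARE: parent and child see each other's stores */
               minherit(shared, len, MAP_INHERIT_SHARE);

               /* MAP_INHERIT_ZERO: the child sees a fresh, zero-filled range */
               minherit(scratch, len, MAP_INHERIT_ZERO);

               /* everything else stays MAP_INHERIT_COPY (copy-on-write) */
               if (fork() == 0) {
                       /* child: scratch[] reads as zeroes, shared[] is shared */
                       _exit(0);
               }
       }
       #endif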
      
      
      /*
       * uvm_mapent_trymerge: try to merge an entry with its neighbors.
       *
       * => called with map locked.
        * => return non-zero if successfully merged.
       */
      
      int
      uvm_mapent_trymerge(struct vm_map *map, struct vm_map_entry *entry, int flags)
      {
              struct uvm_object *uobj;
              struct vm_map_entry *next;
              struct vm_map_entry *prev;
              vsize_t size;
              int merged = 0;
              bool copying;
              int newetype;
      
               if (entry->aref.ar_amap != NULL) {
                       return 0;
               }
               if ((entry->flags & UVM_MAP_NOMERGE) != 0) {
                       return 0;
               }

               uobj = entry->object.uvm_obj;
               size = entry->end - entry->start;
               copying = (flags & UVM_MERGE_COPYING) != 0;
               newetype = copying ? (entry->etype & ~UVM_ET_NEEDSCOPY) : entry->etype;

               next = entry->next;
               if (next != &map->header &&
                   next->start == entry->end &&
                   ((copying && next->aref.ar_amap != NULL &&
                   amap_refs(next->aref.ar_amap) == 1) ||
                   (!copying && next->aref.ar_amap == NULL)) &&
                   UVM_ET_ISCOMPATIBLE(next, newetype,
                   uobj, entry->flags, entry->protection,
                   entry->max_protection, entry->inheritance, entry->advice,
                   entry->wired_count) &&
                  (uobj == NULL || entry->offset + size == next->offset)) {
                      int error;
      
                       if (copying) {
                               error = amap_extend(next, size,
                                   AMAP_EXTEND_NOWAIT|AMAP_EXTEND_BACKWARDS);
                       } else {
                               error = 0;
                       }
                       if (error == 0) {
                               if (uobj) {
                                       if (uobj->pgops->pgo_detach) {
                                               uobj->pgops->pgo_detach(uobj);
                                       }
                               }

                               entry->end = next->end;
                               clear_hints(map, next);
                               uvm_map_entry_unlink(map, next);
                               if (copying) {
                                       entry->aref = next->aref;
                                       entry->etype &= ~UVM_ET_NEEDSCOPY;
                               }
                               uvm_map_check(map, "trymerge forwardmerge");
                              uvm_mapent_free(next);
                              merged++;
                      }
              }
      
               prev = entry->prev;
               if (prev != &map->header &&
                   prev->end == entry->start &&
                   ((copying && !merged && prev->aref.ar_amap != NULL &&
                   amap_refs(prev->aref.ar_amap) == 1) ||
                   (!copying && prev->aref.ar_amap == NULL)) &&
                   UVM_ET_ISCOMPATIBLE(prev, newetype,
                   uobj, entry->flags, entry->protection,
                   entry->max_protection, entry->inheritance, entry->advice,
                   entry->wired_count) &&
                   (uobj == NULL ||
                   prev->offset + prev->end - prev->start == entry->offset)) {
                      int error;
      
                       if (copying) {
                               error = amap_extend(prev, size,
                                   AMAP_EXTEND_NOWAIT|AMAP_EXTEND_FORWARDS);
                       } else {
                               error = 0;
                       }
                       if (error == 0) {
                               if (uobj) {
                                       if (uobj->pgops->pgo_detach) {
                                               uobj->pgops->pgo_detach(uobj);
                                       }
                                       entry->offset = prev->offset;
                               }

                               entry->start = prev->start;
                               clear_hints(map, prev);
                               uvm_map_entry_unlink(map, prev);
                               if (copying) {
                                       entry->aref = prev->aref;
                                       entry->etype &= ~UVM_ET_NEEDSCOPY;
                               }
                               uvm_map_check(map, "trymerge backmerge");
                              uvm_mapent_free(prev);
                              merged++;
                      }
              }
      
              return merged;
      }
      
      /*
       * uvm_map_setup: init map
       *
       * => map must not be in service yet.
       */
      
      void
      uvm_map_setup(struct vm_map *map, vaddr_t vmin, vaddr_t vmax, int flags)
      {
      
               rb_tree_init(&map->rb_tree, &uvm_map_tree_ops);
              map->header.next = map->header.prev = &map->header;
              map->nentries = 0;
              map->size = 0;
              map->ref_count = 1;
              vm_map_setmin(map, vmin);
              vm_map_setmax(map, vmax);
              map->flags = flags;
              map->first_free = &map->header;
              map->hint = &map->header;
              map->timestamp = 0;
              map->busy = NULL;
      
              rw_init(&map->lock);
              cv_init(&map->cv, "vm_map");
              mutex_init(&map->misc_lock, MUTEX_DRIVER, IPL_NONE);
      }
      
      /*
       *   U N M A P   -   m a i n   e n t r y   p o i n t
       */
      
      /*
       * uvm_unmap1: remove mappings from a vm_map (from "start" up to "stop")
       *
       * => caller must check alignment and size
       * => map must be unlocked (we will lock it)
       * => flags is UVM_FLAG_QUANTUM or 0.
       */
      
      void
      uvm_unmap1(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
      {
               struct vm_map_entry *dead_entries;
              UVMHIST_FUNC("uvm_unmap"); UVMHIST_CALLED(maphist);
      
              KASSERTMSG(start < end,
                  "%s: map %p: start %#jx < end %#jx", __func__, map,
                  (uintmax_t)start, (uintmax_t)end);
              UVMHIST_LOG(maphist, "  (map=%#jx, start=%#jx, end=%#jx)",
                  (uintptr_t)map, start, end, 0);
               if (map == kernel_map) {
                       LOCKDEBUG_MEM_CHECK((void *)start, end - start);
               }

               /*
                * work now done by helper functions.   wipe the pmap mappings
                * and then detach from the dead entries...
                */
               vm_map_lock(map);
               uvm_unmap_remove(map, start, end, &dead_entries, flags);
               vm_map_unlock(map);

               if (dead_entries != NULL)
                       uvm_unmap_detach(dead_entries, 0);

               UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
      }
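
       /*
        * Illustrative sketch (not part of the original source): the usual
        * caller pattern.  uvm_unmap() is the flags == 0 wrapper around
        * uvm_unmap1() used elsewhere in this file; "va" and "size" are
        * hypothetical and must describe a page-aligned range in a map
        * that the caller has not locked.
        */
       #if 0
               /* tear down the mapping established earlier at [va, va + size) */
               uvm_unmap(kernel_map, va, va + size);
       #endif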
      
      
      /*
       * uvm_map_reference: add reference to a map
       *
       * => map need not be locked (we use misc_lock).
       */
      
      void
      uvm_map_reference(struct vm_map *map)
      {
              mutex_enter(&map->misc_lock);
              map->ref_count++;
              mutex_exit(&map->misc_lock);
      }
      
      bool
      vm_map_starved_p(struct vm_map *map)
      {
      
              if ((map->flags & VM_MAP_WANTVA) != 0) {
                      return true;
              }
              /* XXX */
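               /*
                * Heuristic: treat the map as starved once more than 15/16
                * of its VA range is in use, e.g. a map covering 16MB of VA
                * is considered starved once map->size exceeds 15MB.
                */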
              if ((vm_map_max(map) - vm_map_min(map)) / 16 * 15 < map->size) {
                      return true;
              }
              return false;
      }
      
      void
       uvm_map_lock_entry(struct vm_map_entry *entry)
       {

               if (entry->aref.ar_amap != NULL) {
                       amap_lock(entry->aref.ar_amap);
               }
               if (UVM_ET_ISOBJ(entry)) {
                       mutex_enter(entry->object.uvm_obj->vmobjlock);
              }
      }
      
      void
      uvm_map_unlock_entry(struct vm_map_entry *entry)
      {
      
               if (UVM_ET_ISOBJ(entry)) {
                       mutex_exit(entry->object.uvm_obj->vmobjlock);
               }
               if (entry->aref.ar_amap != NULL) {
                       amap_unlock(entry->aref.ar_amap);
              }
      }
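
       /*
        * Note on ordering in the pair above: uvm_map_lock_entry() takes the
        * entry's amap lock before its backing object's vmobjlock, and
        * uvm_map_unlock_entry() releases them in the reverse order.
        */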
      
      #if defined(DDB) || defined(DEBUGPRINT)
      
      /*
       * uvm_map_printit: actually prints the map
       */
      
      void
      uvm_map_printit(struct vm_map *map, bool full,
          void (*pr)(const char *, ...))
      {
              struct vm_map_entry *entry;
      
              (*pr)("MAP %p: [%#lx->%#lx]\n", map, vm_map_min(map),
                  vm_map_max(map));
              (*pr)("\t#ent=%d, sz=%d, ref=%d, version=%d, flags=%#x\n",
                  map->nentries, map->size, map->ref_count, map->timestamp,
                  map->flags);
              (*pr)("\tpmap=%p(resident=%ld, wired=%ld)\n", map->pmap,
                  pmap_resident_count(map->pmap), pmap_wired_count(map->pmap));
              if (!full)
                      return;
              for (entry = map->header.next; entry != &map->header;
                  entry = entry->next) {
                      (*pr)(" - %p: %#lx->%#lx: obj=%p/%#llx, amap=%p/%d\n",
                          entry, entry->start, entry->end, entry->object.uvm_obj,
                          (long long)entry->offset, entry->aref.ar_amap,
                          entry->aref.ar_pageoff);
                      (*pr)(
                          "\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, "
                          "wc=%d, adv=%d\n",
                          (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
                          (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
                          (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
                          entry->protection, entry->max_protection,
                          entry->inheritance, entry->wired_count, entry->advice);
              }
      }
      
      void
      uvm_whatis(uintptr_t addr, void (*pr)(const char *, ...))
      {
              struct vm_map *map;
      
              for (map = kernel_map;;) {
                      struct vm_map_entry *entry;
      
                      if (!uvm_map_lookup_entry_bytree(map, (vaddr_t)addr, &entry)) {
                              break;
                      }
                      (*pr)("%p is %p+%zu from VMMAP %p\n",
                          (void *)addr, (void *)entry->start,
                          (size_t)(addr - (uintptr_t)entry->start), map);
                      if (!UVM_ET_ISSUBMAP(entry)) {
                              break;
                      }
                      map = entry->object.sub_map;
              }
      }
      
      #endif /* DDB || DEBUGPRINT */
      
      #ifndef __USER_VA0_IS_SAFE
      static int
      sysctl_user_va0_disable(SYSCTLFN_ARGS)
      {
              struct sysctlnode node;
              int t, error;
      
              node = *rnode;
              node.sysctl_data = &t;
              t = user_va0_disable;
              error = sysctl_lookup(SYSCTLFN_CALL(&node));
              if (error || newp == NULL)
                      return (error);
      
              if (!t && user_va0_disable &&
                  kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MAP_VA_ZERO, 0,
                  NULL, NULL, NULL))
                      return EPERM;
      
              user_va0_disable = !!t;
              return 0;
      }
      #endif
      
      static int
      fill_vmentry(struct lwp *l, struct proc *p, struct kinfo_vmentry *kve,
          struct vm_map *m, struct vm_map_entry *e)
      {
      #ifndef _RUMPKERNEL
              int error;
      
              memset(kve, 0, sizeof(*kve));
              KASSERT(e != NULL);
              if (UVM_ET_ISOBJ(e)) {
                      struct uvm_object *uobj = e->object.uvm_obj;
                      KASSERT(uobj != NULL);
                      kve->kve_ref_count = uobj->uo_refs;
                      kve->kve_count = uobj->uo_npages;
                      if (UVM_OBJ_IS_VNODE(uobj)) {
                              struct vattr va;
                              struct vnode *vp = (struct vnode *)uobj;
                              vn_lock(vp, LK_SHARED | LK_RETRY);
                              error = VOP_GETATTR(vp, &va, l->l_cred);
                              VOP_UNLOCK(vp);
                              kve->kve_type = KVME_TYPE_VNODE;
                              if (error == 0) {
                                      kve->kve_vn_size = vp->v_size;
                                      kve->kve_vn_type = (int)vp->v_type;
                                      kve->kve_vn_mode = va.va_mode;
                                      kve->kve_vn_rdev = va.va_rdev;
                                      kve->kve_vn_fileid = va.va_fileid;
                                      kve->kve_vn_fsid = va.va_fsid;
                                      error = vnode_to_path(kve->kve_path,
                                          sizeof(kve->kve_path) / 2, vp, l, p);
      #ifdef DIAGNOSTIC
                                      if (error)
                                              printf("%s: vp %p error %d\n", __func__,
                                                      vp, error);
      #endif
                              }
                      } else if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
                              kve->kve_type = KVME_TYPE_KERN;
                      } else if (UVM_OBJ_IS_DEVICE(uobj)) {
                              kve->kve_type = KVME_TYPE_DEVICE;
                      } else if (UVM_OBJ_IS_AOBJ(uobj)) {
                              kve->kve_type = KVME_TYPE_ANON;
                      } else {
                              kve->kve_type = KVME_TYPE_OBJECT;
                      }
              } else if (UVM_ET_ISSUBMAP(e)) {
                      struct vm_map *map = e->object.sub_map;
                      KASSERT(map != NULL);
                      kve->kve_ref_count = map->ref_count;
                      kve->kve_count = map->nentries;
                      kve->kve_type = KVME_TYPE_SUBMAP;
              } else
                      kve->kve_type = KVME_TYPE_UNKNOWN;
      
              kve->kve_start = e->start;
              kve->kve_end = e->end;
              kve->kve_offset = e->offset;
              kve->kve_wired_count = e->wired_count;
              kve->kve_inheritance = e->inheritance;
              kve->kve_attributes = 0; /* unused */
              kve->kve_advice = e->advice;
      #define PROT(p) (((p) & VM_PROT_READ) ? KVME_PROT_READ : 0) | \
              (((p) & VM_PROT_WRITE) ? KVME_PROT_WRITE : 0) | \
              (((p) & VM_PROT_EXECUTE) ? KVME_PROT_EXEC : 0)
              kve->kve_protection = PROT(e->protection);
              kve->kve_max_protection = PROT(e->max_protection);
              kve->kve_flags |= (e->etype & UVM_ET_COPYONWRITE)
                  ? KVME_FLAG_COW : 0;
              kve->kve_flags |= (e->etype & UVM_ET_NEEDSCOPY)
                  ? KVME_FLAG_NEEDS_COPY : 0;
              kve->kve_flags |= (m->flags & VM_MAP_TOPDOWN)
                  ? KVME_FLAG_GROWS_DOWN : KVME_FLAG_GROWS_UP;
              kve->kve_flags |= (m->flags & VM_MAP_PAGEABLE)
                  ? KVME_FLAG_PAGEABLE : 0;
      #endif
              return 0;
      }
      
      static int
      fill_vmentries(struct lwp *l, pid_t pid, u_int elem_size, void *oldp,
          size_t *oldlenp)
      {
              int error;
              struct proc *p;
              struct kinfo_vmentry *vme;
              struct vmspace *vm;
              struct vm_map *map;
              struct vm_map_entry *entry;
              char *dp;
              size_t count, vmesize;
      
              if (elem_size == 0 || elem_size > 2 * sizeof(*vme))
                      return EINVAL;
      
              if (oldp) {
                      if (*oldlenp > 10UL * 1024UL * 1024UL)
                              return E2BIG;
                      count = *oldlenp / elem_size;
                      if (count == 0)
                              return ENOMEM;
                      vmesize = count * sizeof(*vme);
              } else
                      vmesize = 0;
      
              if ((error = proc_find_locked(l, &p, pid)) != 0)
                      return error;
      
              vme = NULL;
              count = 0;
      
              if ((error = proc_vmspace_getref(p, &vm)) != 0)
                      goto out;
      
              map = &vm->vm_map;
              vm_map_lock_read(map);
      
              dp = oldp;
              if (oldp)
                      vme = kmem_alloc(vmesize, KM_SLEEP);
              for (entry = map->header.next; entry != &map->header;
                  entry = entry->next) {
                      if (oldp && (dp - (char *)oldp) < vmesize) {
                              error = fill_vmentry(l, p, &vme[count], map, entry);
                              if (error)
                                      goto out;
                              dp += elem_size;
                      }
                      count++;
              }
              vm_map_unlock_read(map);
              uvmspace_free(vm);
      
      out:
              if (pid != -1)
                      mutex_exit(p->p_lock);
              if (error == 0) {
                      const u_int esize = uimin(sizeof(*vme), elem_size);
                      dp = oldp;
                      for (size_t i = 0; i < count; i++) {
                              if (oldp && (dp - (char *)oldp) < vmesize) {
                                      error = sysctl_copyout(l, &vme[i], dp, esize);
                                      if (error)
                                              break;
                                      dp += elem_size;
                              } else
                                      break;
                      }
                      count *= elem_size;
                      if (oldp != NULL && *oldlenp < count)
                              error = ENOSPC;
                      *oldlenp = count;
              }
              if (vme)
                      kmem_free(vme, vmesize);
              return error;
      }
      
      static int
      sysctl_vmproc(SYSCTLFN_ARGS)
      {
              int error;
      
              if (namelen == 1 && name[0] == CTL_QUERY)
                      return (sysctl_query(SYSCTLFN_CALL(rnode)));
      
              if (namelen == 0)
                      return EINVAL;
      
              switch (name[0]) {
              case VM_PROC_MAP:
                      if (namelen != 3)
                              return EINVAL;
                      sysctl_unlock();
                      error = fill_vmentries(l, name[1], name[2], oldp, oldlenp);
                      sysctl_relock();
                      return error;
              default:
                      return EINVAL;
              }
      }
      
      SYSCTL_SETUP(sysctl_uvmmap_setup, "sysctl uvmmap setup")
      {
      
              sysctl_createv(clog, 0, NULL, NULL,
                             CTLFLAG_PERMANENT,
                             CTLTYPE_STRUCT, "proc",
                             SYSCTL_DESCR("Process vm information"),
                             sysctl_vmproc, 0, NULL, 0,
                             CTL_VM, VM_PROC, CTL_EOL);
      #ifndef __USER_VA0_IS_SAFE
              sysctl_createv(clog, 0, NULL, NULL,
                             CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
                             CTLTYPE_INT, "user_va0_disable",
                             SYSCTL_DESCR("Disable VA 0"),
                             sysctl_user_va0_disable, 0, &user_va0_disable, 0,
                             CTL_VM, CTL_CREATE, CTL_EOL);
      #endif
      }
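
       /*
        * Illustrative sketch (not part of the original source): a userland
        * consumer of the vm.proc.map node registered above.  The MIB layout
        * follows sysctl_vmproc(): { CTL_VM, VM_PROC, VM_PROC_MAP, pid,
        * elem_size }; the helper name get_proc_map() is hypothetical.
        */
       #if 0
       /* userland code; needs <sys/param.h>, <sys/sysctl.h>, <stdlib.h> */
       static struct kinfo_vmentry *
       get_proc_map(pid_t pid, size_t *nentries)
       {
               int mib[5] = { CTL_VM, VM_PROC, VM_PROC_MAP, pid,
                   sizeof(struct kinfo_vmentry) };
               size_t len = 0;
               struct kinfo_vmentry *kve;

               /* first call sizes the buffer, second call fills it in */
               if (sysctl(mib, 5, NULL, &len, NULL, 0) == -1)
                       return NULL;
               kve = malloc(len);
               if (kve == NULL || sysctl(mib, 5, kve, &len, NULL, 0) == -1) {
                       free(kve);
                       return NULL;
               }
               *nentries = len / sizeof(*kve);
               return kve;
       }
       #endif
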
      /* $NetBSD: kern_fileassoc.c,v 1.36 2014/07/10 15:00:28 christos Exp $ */
      
      /*-
       * Copyright (c) 2006 Elad Efrat <elad@NetBSD.org>
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. The name of the author may not be used to endorse or promote products
       *    derived from this software without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
       * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
       * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
       * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
       * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
       * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
       * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: kern_fileassoc.c,v 1.36 2014/07/10 15:00:28 christos Exp $");
      
      #include "opt_fileassoc.h"
      
      #include <sys/param.h>
      #include <sys/mount.h>
      #include <sys/queue.h>
      #include <sys/vnode.h>
      #include <sys/errno.h>
      #include <sys/fileassoc.h>
      #include <sys/specificdata.h>
      #include <sys/hash.h>
      #include <sys/kmem.h>
      #include <sys/once.h>
      
      #define        FILEASSOC_INITIAL_TABLESIZE        128
      
      static specificdata_domain_t fileassoc_domain = NULL;
      static specificdata_key_t fileassoc_mountspecific_key;
      static ONCE_DECL(control);
      
      /*
       * Assoc entry.
       * Includes the assoc name for identification and private clear callback.
       */
      struct fileassoc {
              LIST_ENTRY(fileassoc) assoc_list;
              const char *assoc_name;                                /* Name. */
              fileassoc_cleanup_cb_t assoc_cleanup_cb;        /* Clear callback. */
              specificdata_key_t assoc_key;
      };
      
      static LIST_HEAD(, fileassoc) fileassoc_list;
      
      /* An entry in the per-mount hash table. */
      struct fileassoc_file {
              fhandle_t *faf_handle;                                /* File handle */
              specificdata_reference faf_data;                /* Assoc data. */
              u_int faf_nassocs;                                /* # of assocs. */
              LIST_ENTRY(fileassoc_file) faf_list;                /* List pointer. */
      };
      
      LIST_HEAD(fileassoc_hash_entry, fileassoc_file);
      
      struct fileassoc_table {
              struct fileassoc_hash_entry *tbl_hash;
              u_long tbl_mask;                                /* Hash table mask. */
              size_t tbl_nslots;                                /* Number of slots. */
              size_t tbl_nused;                                /* # of used slots. */
              specificdata_reference tbl_data;
      };
      
      /*
        * Hashing function: hashes the file handle and takes the result
        * modulo the table mask to give back an index into the hash table.
       */
      #define FILEASSOC_HASH(tbl, handle)        \
              (hash32_buf((handle), FHANDLE_SIZE(handle), HASH32_BUF_INIT) \
               & ((tbl)->tbl_mask))
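
       /*
        * Example (illustrative): with the initial FILEASSOC_INITIAL_TABLESIZE
        * (128) slot table the mask ends up as 127, so a handle "fh" lands in
        * chain
        *
        *        hash32_buf(fh, FHANDLE_SIZE(fh), HASH32_BUF_INIT) & 127
        *
        * which is how fileassoc_file_lookup() and fileassoc_file_add() below
        * pick the hash chain to walk.
        */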
      
      static void *
      file_getdata(struct fileassoc_file *faf, const struct fileassoc *assoc)
      {
      
              return specificdata_getspecific(fileassoc_domain, &faf->faf_data,
                  assoc->assoc_key);
      }
      
      static void
      file_setdata(struct fileassoc_file *faf, const struct fileassoc *assoc,
          void *data)
      {
      
              specificdata_setspecific(fileassoc_domain, &faf->faf_data,
                  assoc->assoc_key, data);
      }
      
      static void
      file_cleanup(struct fileassoc_file *faf, const struct fileassoc *assoc)
      {
              fileassoc_cleanup_cb_t cb;
              void *data;
      
              cb = assoc->assoc_cleanup_cb;
              if (cb == NULL) {
                      return;
              }
              data = file_getdata(faf, assoc);
              (*cb)(data);
      }
      
      static void
      file_free(struct fileassoc_file *faf)
      {
              struct fileassoc *assoc;
      
              LIST_REMOVE(faf, faf_list);
      
              LIST_FOREACH(assoc, &fileassoc_list, assoc_list) {
                      file_cleanup(faf, assoc);
              }
              vfs_composefh_free(faf->faf_handle);
              specificdata_fini(fileassoc_domain, &faf->faf_data);
              kmem_free(faf, sizeof(*faf));
      }
      
      static void
      table_dtor(void *v)
      {
              struct fileassoc_table *tbl = v;
              u_long i;
      
              /* Remove all entries from the table and lists */
              for (i = 0; i < tbl->tbl_nslots; i++) {
                      struct fileassoc_file *faf;
      
                      while ((faf = LIST_FIRST(&tbl->tbl_hash[i])) != NULL) {
                              file_free(faf);
                      }
              }
      
              /* Remove hash table and sysctl node */
              hashdone(tbl->tbl_hash, HASH_LIST, tbl->tbl_mask);
              specificdata_fini(fileassoc_domain, &tbl->tbl_data);
              kmem_free(tbl, sizeof(*tbl));
      }
      
      /*
       * Initialize the fileassoc subsystem.
       */
      static int
      fileassoc_init(void)
      {
              int error;
      
              error = mount_specific_key_create(&fileassoc_mountspecific_key,
                  table_dtor);
              if (error) {
                      return error;
              }
              fileassoc_domain = specificdata_domain_create();
      
              return 0;
      }
      
      /*
       * Register a new assoc.
       */
      int
      fileassoc_register(const char *name, fileassoc_cleanup_cb_t cleanup_cb,
          fileassoc_t *result)
      {
              int error;
              specificdata_key_t key;
              struct fileassoc *assoc;
      
              error = RUN_ONCE(&control, fileassoc_init);
              if (error) {
                      return error;
              }
              error = specificdata_key_create(fileassoc_domain, &key, NULL);
              if (error) {
                      return error;
              }
              assoc = kmem_alloc(sizeof(*assoc), KM_SLEEP);
              assoc->assoc_name = name;
              assoc->assoc_cleanup_cb = cleanup_cb;
              assoc->assoc_key = key;
      
              LIST_INSERT_HEAD(&fileassoc_list, assoc, assoc_list);
      
              *result = assoc;
      
              return 0;
      }
      
      /*
       * Deregister an assoc.
       */
      int
      fileassoc_deregister(fileassoc_t assoc)
      {
      
              LIST_REMOVE(assoc, assoc_list);
              specificdata_key_delete(fileassoc_domain, assoc->assoc_key);
              kmem_free(assoc, sizeof(*assoc));
      
              return 0;
      }
      
      /*
       * Get the hash table for the specified device.
       */
      static struct fileassoc_table *
      fileassoc_table_lookup(struct mount *mp)
      {
              int error;
      
               error = RUN_ONCE(&control, fileassoc_init);
               if (error) {
                       return NULL;
               }
               return mount_getspecific(mp, fileassoc_mountspecific_key);
      }
      
      /*
       * Perform a lookup on a hash table.  If hint is non-zero then use the value
       * of the hint as the identifier instead of performing a lookup for the
       * fileid.
       */
      static struct fileassoc_file *
      fileassoc_file_lookup(struct vnode *vp, fhandle_t *hint)
      {
              struct fileassoc_table *tbl;
              struct fileassoc_hash_entry *hash_entry;
              struct fileassoc_file *faf;
              size_t indx;
               fhandle_t *th;
              int error;
      
              tbl = fileassoc_table_lookup(vp->v_mount);
              if (tbl == NULL) {
                      return NULL;
              }
      
              if (hint == NULL) {
                      error = vfs_composefh_alloc(vp, &th);
                      if (error)
                              return (NULL);
              } else {
                      th = hint;
              }
      
              indx = FILEASSOC_HASH(tbl, th);
              hash_entry = &(tbl->tbl_hash[indx]);
      
              LIST_FOREACH(faf, hash_entry, faf_list) {
                      if (((FHANDLE_FILEID(faf->faf_handle)->fid_len ==
                           FHANDLE_FILEID(th)->fid_len)) &&
                          (memcmp(FHANDLE_FILEID(faf->faf_handle), FHANDLE_FILEID(th),
                                 (FHANDLE_FILEID(th))->fid_len) == 0)) {
                              break;
                      }
              }
      
               if (hint == NULL)
                      vfs_composefh_free(th);
      
              return faf;
      }
      
      /*
       * Return assoc data associated with a vnode.
       */
      void *
      fileassoc_lookup(struct vnode *vp, fileassoc_t assoc)
      {
              struct fileassoc_file *faf;
      
              faf = fileassoc_file_lookup(vp, NULL);
              if (faf == NULL)
                      return (NULL);
      
              return file_getdata(faf, assoc);
      }
      
      static struct fileassoc_table *
      fileassoc_table_resize(struct fileassoc_table *tbl)
      {
              struct fileassoc_table *newtbl;
              u_long i;
      
              /*
               * Allocate a new table. Like the condition in fileassoc_file_add(),
               * this is also temporary -- just double the number of slots.
               */
              newtbl = kmem_zalloc(sizeof(*newtbl), KM_SLEEP);
              newtbl->tbl_nslots = (tbl->tbl_nslots * 2);
              if (newtbl->tbl_nslots < tbl->tbl_nslots)
                      newtbl->tbl_nslots = tbl->tbl_nslots;
              newtbl->tbl_hash = hashinit(newtbl->tbl_nslots, HASH_LIST,
                  true, &newtbl->tbl_mask);
              newtbl->tbl_nused = 0;
              specificdata_init(fileassoc_domain, &newtbl->tbl_data);
      
              /* XXX we need to make sure nothing uses fileassoc here! */
      
              for (i = 0; i < tbl->tbl_nslots; i++) {
                      struct fileassoc_file *faf;
      
                      while ((faf = LIST_FIRST(&tbl->tbl_hash[i])) != NULL) {
                              struct fileassoc_hash_entry *hash_entry;
                              size_t indx;
      
                              LIST_REMOVE(faf, faf_list);
      
                              indx = FILEASSOC_HASH(newtbl, faf->faf_handle);
                              hash_entry = &(newtbl->tbl_hash[indx]);
      
                              LIST_INSERT_HEAD(hash_entry, faf, faf_list);
      
                              newtbl->tbl_nused++;
                      }
              }
      
              if (tbl->tbl_nused != newtbl->tbl_nused)
                      panic("fileassoc_table_resize: inconsistency detected! "
                          "needed %zu entries, got %zu", tbl->tbl_nused,
                          newtbl->tbl_nused);
      
              hashdone(tbl->tbl_hash, HASH_LIST, tbl->tbl_mask);
              specificdata_fini(fileassoc_domain, &tbl->tbl_data);
              kmem_free(tbl, sizeof(*tbl));
      
              return (newtbl);
      }
      
      /*
       * Create a new fileassoc table.
       */
      static struct fileassoc_table *
      fileassoc_table_add(struct mount *mp)
      {
              struct fileassoc_table *tbl;
      
              /* Check for existing table for device. */
              tbl = fileassoc_table_lookup(mp);
              if (tbl != NULL)
                      return (tbl);
      
              /* Allocate and initialize a table. */
              tbl = kmem_zalloc(sizeof(*tbl), KM_SLEEP);
              tbl->tbl_nslots = FILEASSOC_INITIAL_TABLESIZE;
              tbl->tbl_hash = hashinit(tbl->tbl_nslots, HASH_LIST, true,
                  &tbl->tbl_mask);
              tbl->tbl_nused = 0;
              specificdata_init(fileassoc_domain, &tbl->tbl_data);
      
              mount_setspecific(mp, fileassoc_mountspecific_key, tbl);
      
              return (tbl);
      }
      
      /*
       * Delete a table.
       */
      int
      fileassoc_table_delete(struct mount *mp)
      {
              struct fileassoc_table *tbl;
      
              tbl = fileassoc_table_lookup(mp);
              if (tbl == NULL)
                      return (EEXIST);
      
              mount_setspecific(mp, fileassoc_mountspecific_key, NULL);
              table_dtor(tbl);
      
              return (0);
      }
      
      /*
       * Run a callback for each assoc in a table.
       */
      int
      fileassoc_table_run(struct mount *mp, fileassoc_t assoc, fileassoc_cb_t cb,
          void *cookie)
      {
              struct fileassoc_table *tbl;
              u_long i;
      
              tbl = fileassoc_table_lookup(mp);
              if (tbl == NULL)
                      return (EEXIST);
      
              for (i = 0; i < tbl->tbl_nslots; i++) {
                      struct fileassoc_file *faf;
      
                      LIST_FOREACH(faf, &tbl->tbl_hash[i], faf_list) {
                              void *data;
      
                              data = file_getdata(faf, assoc);
                              if (data != NULL)
                                      cb(data, cookie);
                      }
              }
      
              return (0);
      }
      
      /*
       * Clear a table for a given assoc.
       */
      int
      fileassoc_table_clear(struct mount *mp, fileassoc_t assoc)
      {
              struct fileassoc_table *tbl;
              u_long i;
      
              tbl = fileassoc_table_lookup(mp);
              if (tbl == NULL)
                      return (EEXIST);
      
              for (i = 0; i < tbl->tbl_nslots; i++) {
                      struct fileassoc_file *faf;
      
                      LIST_FOREACH(faf, &tbl->tbl_hash[i], faf_list) {
                              file_cleanup(faf, assoc);
                              file_setdata(faf, assoc, NULL);
                      }
              }
      
              return (0);
      }
      
      /*
       * Add a file entry to a table.
       */
      static struct fileassoc_file *
      fileassoc_file_add(struct vnode *vp, fhandle_t *hint)
      {
              struct fileassoc_table *tbl;
              struct fileassoc_hash_entry *hash_entry;
              struct fileassoc_file *faf;
              size_t indx;
              fhandle_t *th;
              int error;
      
              if (hint == NULL) {
                      error = vfs_composefh_alloc(vp, &th);
                      if (error)
                              return (NULL);
              } else
                      th = hint;
      
              faf = fileassoc_file_lookup(vp, th);
              if (faf != NULL) {
                      if (hint == NULL)
                              vfs_composefh_free(th);
      
                      return (faf);
              }
      
              tbl = fileassoc_table_lookup(vp->v_mount);
              if (tbl == NULL) {
                      tbl = fileassoc_table_add(vp->v_mount);
              }
      
              indx = FILEASSOC_HASH(tbl, th);
              hash_entry = &(tbl->tbl_hash[indx]);
      
              faf = kmem_zalloc(sizeof(*faf), KM_SLEEP);
              faf->faf_handle = th;
              specificdata_init(fileassoc_domain, &faf->faf_data);
              LIST_INSERT_HEAD(hash_entry, faf, faf_list);
      
              /*
               * This decides when we need to resize the table. For now,
               * resize it whenever we "filled" up the number of slots it
               * has. That's not really true unless of course we had zero
               * collisions. Think positive! :)
               */
              if (++(tbl->tbl_nused) == tbl->tbl_nslots) { 
                      struct fileassoc_table *newtbl;
      
                      newtbl = fileassoc_table_resize(tbl);
                      mount_setspecific(vp->v_mount, fileassoc_mountspecific_key,
                          newtbl);
              }
      
              return (faf);
      }
      
      /*
       * Delete a file entry from a table.
       */
      int
      fileassoc_file_delete(struct vnode *vp)
      {
              struct fileassoc_table *tbl;
              struct fileassoc_file *faf;
      
              /* Pre-check if fileassoc is used. XXX */
               if (!fileassoc_domain) {
                       return ENOENT;
               }
               KERNEL_LOCK(1, NULL);

               faf = fileassoc_file_lookup(vp, NULL);
               if (faf == NULL) {
                       KERNEL_UNLOCK_ONE(NULL);
                       return (ENOENT);
               }

               file_free(faf);

               tbl = fileassoc_table_lookup(vp->v_mount);
               KASSERT(tbl != NULL);
               --(tbl->tbl_nused); /* XXX gc? */

               KERNEL_UNLOCK_ONE(NULL);

               return (0);
      }
      
      /*
       * Add an assoc to a vnode.
       */
      int
      fileassoc_add(struct vnode *vp, fileassoc_t assoc, void *data)
      {
              struct fileassoc_file *faf;
              void *olddata;
      
              faf = fileassoc_file_lookup(vp, NULL);
              if (faf == NULL) {
                      faf = fileassoc_file_add(vp, NULL);
                      if (faf == NULL)
                              return (ENOTDIR);
              }
      
              olddata = file_getdata(faf, assoc);
              if (olddata != NULL)
                      return (EEXIST);
      
              file_setdata(faf, assoc, data);
      
              faf->faf_nassocs++;
      
              return (0);
      }
      
      /*
       * Clear an assoc from a vnode.
       */
      int
      fileassoc_clear(struct vnode *vp, fileassoc_t assoc)
      {
              struct fileassoc_file *faf;
      
              faf = fileassoc_file_lookup(vp, NULL);
              if (faf == NULL)
                      return (ENOENT);
      
              file_cleanup(faf, assoc);
              file_setdata(faf, assoc, NULL);
      
              --(faf->faf_nassocs); /* XXX gc? */
      
              return (0);
      }
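
       /*
        * Illustrative sketch (not part of the original source): how a
        * kernel subsystem would typically drive the fileassoc API defined
        * in this file.  The names example_assoc, example_cleanup,
        * example_print and example_usage are hypothetical, and the
        * callback signatures are assumed to match the way file_cleanup()
        * and fileassoc_table_run() invoke them.
        */
       #if 0
       static fileassoc_t example_assoc;

       static void
       example_cleanup(void *data)
       {

               kmem_free(data, sizeof(int));
       }

       static void
       example_print(void *data, void *cookie)
       {

               printf("tag=%d\n", *(int *)data);
       }

       static void
       example_usage(struct vnode *vp, struct mount *mp)
       {
               int *tag;

               /* one-time registration, e.g. from the subsystem's init code */
               fileassoc_register("example", example_cleanup, &example_assoc);

               /* attach private data to a vnode... */
               tag = kmem_alloc(sizeof(*tag), KM_SLEEP);
               *tag = 42;
               fileassoc_add(vp, example_assoc, tag);

               /* ...find it again later... */
               tag = fileassoc_lookup(vp, example_assoc);

               /* ...visit every tagged file on a mount... */
               fileassoc_table_run(mp, example_assoc, example_print, NULL);

               /* ...and detach it; example_cleanup() frees the data */
               fileassoc_clear(vp, example_assoc);
       }
       #endif
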
      /*        $NetBSD: pfil.c,v 1.35 2017/03/10 07:35:58 ryo Exp $        */
      
      /*
       * Copyright (c) 2013 Mindaugas Rasiukevicius <rmind at NetBSD org>
       * Copyright (c) 1996 Matthew R. Green
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
       * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
       * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
       * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
       * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
       * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
       * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
       * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
       * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: pfil.c,v 1.35 2017/03/10 07:35:58 ryo Exp $");
      
      #if defined(_KERNEL_OPT)
      #include "opt_net_mpsafe.h"
      #endif
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/queue.h>
      #include <sys/kmem.h>
      #include <sys/psref.h>
      
      #include <net/if.h>
      #include <net/pfil.h>
      
      #define        MAX_HOOKS        8
      
      /* Func is either pfil_func_t or pfil_ifunc_t. */
      typedef void                (*pfil_polyfunc_t)(void);
      
      typedef struct {
              pfil_polyfunc_t pfil_func;
              void *                pfil_arg;
      } pfil_hook_t;
      
      typedef struct {
              pfil_hook_t        hooks[MAX_HOOKS];
              u_int                nhooks;
              struct psref_target psref;
      } pfil_list_t;
      
      typedef struct {
              pfil_list_t        *active;        /* lists[0] or lists[1] */
              pfil_list_t        lists[2];
      } pfil_listset_t;
      
      CTASSERT(PFIL_IN == 1);
      CTASSERT(PFIL_OUT == 2);
      
      struct pfil_head {
              pfil_listset_t        ph_in;
              pfil_listset_t        ph_out;
              pfil_listset_t        ph_ifaddr;
              pfil_listset_t        ph_ifevent;
              int                ph_type;
              void *                ph_key;
              LIST_ENTRY(pfil_head) ph_list;
      };
      
      static const int pfil_flag_cases[] = {
              PFIL_IN, PFIL_OUT
      };
      
      static LIST_HEAD(, pfil_head) pfil_head_list __read_mostly =
          LIST_HEAD_INITIALIZER(&pfil_head_list);
      
      static kmutex_t pfil_mtx __cacheline_aligned;
      static struct psref_class *pfil_psref_class __read_mostly;
      static pserialize_t pfil_psz;
      
      void
      pfil_init(void)
      {
              mutex_init(&pfil_mtx, MUTEX_DEFAULT, IPL_NONE);
              pfil_psz = pserialize_create();
              pfil_psref_class = psref_class_create("pfil", IPL_SOFTNET);
      }
      
      static inline void
      pfil_listset_init(pfil_listset_t *pflistset)
      {
              pflistset->active = &pflistset->lists[0];
              psref_target_init(&pflistset->active->psref, pfil_psref_class);
      }
      
      /*
       * pfil_head_create: create and register a packet filter head.
       */
      pfil_head_t *
      pfil_head_create(int type, void *key)
      {
              pfil_head_t *ph;
      
              if (pfil_head_get(type, key)) {
                      return NULL;
              }
              ph = kmem_zalloc(sizeof(pfil_head_t), KM_SLEEP);
              ph->ph_type = type;
              ph->ph_key = key;
      
              pfil_listset_init(&ph->ph_in);
              pfil_listset_init(&ph->ph_out);
              pfil_listset_init(&ph->ph_ifaddr);
              pfil_listset_init(&ph->ph_ifevent);
      
              LIST_INSERT_HEAD(&pfil_head_list, ph, ph_list);
              return ph;
      }
      
      /*
       * pfil_head_destroy: remove and destroy a packet filter head.
       */
      void
      pfil_head_destroy(pfil_head_t *pfh)
      {
              LIST_REMOVE(pfh, ph_list);
      
              psref_target_destroy(&pfh->ph_in.active->psref, pfil_psref_class);
              psref_target_destroy(&pfh->ph_out.active->psref, pfil_psref_class);
              psref_target_destroy(&pfh->ph_ifaddr.active->psref, pfil_psref_class);
              psref_target_destroy(&pfh->ph_ifevent.active->psref, pfil_psref_class);
      
              kmem_free(pfh, sizeof(pfil_head_t));
      }
      
      /*
        * pfil_head_get: returns the packet filter head for a given key.
       */
      pfil_head_t *
      pfil_head_get(int type, void *key)
      {
              pfil_head_t *ph;
      
              LIST_FOREACH(ph, &pfil_head_list, ph_list) {
                      if (ph->ph_type == type && ph->ph_key == key)
                              break;
              }
              return ph;
      }
      
      static pfil_listset_t *
      pfil_hook_get(int dir, pfil_head_t *ph)
      {
               switch (dir) {
               case PFIL_IN:
                       return &ph->ph_in;
               case PFIL_OUT:
                       return &ph->ph_out;
              case PFIL_IFADDR:
                      return &ph->ph_ifaddr;
              case PFIL_IFNET:
                      return &ph->ph_ifevent;
              }
              return NULL;
      }
      
      static int
      pfil_list_add(pfil_listset_t *phlistset, pfil_polyfunc_t func, void *arg,
                    int flags)
      {
              u_int nhooks;
              pfil_list_t *newlist, *oldlist;
              pfil_hook_t *pfh;
      
              mutex_enter(&pfil_mtx);
      
              /* Check if we have a free slot. */
              nhooks = phlistset->active->nhooks;
              if (nhooks == MAX_HOOKS) {
                      mutex_exit(&pfil_mtx);
                      return ENOSPC;
              }
              KASSERT(nhooks < MAX_HOOKS);
      
              if (phlistset->active == &phlistset->lists[0]) {
                      oldlist = &phlistset->lists[0];
                      newlist = &phlistset->lists[1];
               } else {
                      oldlist = &phlistset->lists[1];
                      newlist = &phlistset->lists[0];
              }
      
              /* Make sure the hook is not already added. */
              for (u_int i = 0; i < nhooks; i++) {
                      pfh = &oldlist->hooks[i];
                      if (pfh->pfil_func == func && pfh->pfil_arg == arg) {
                              mutex_exit(&pfil_mtx);
                              return EEXIST;
                      }
              }
      
              /* create new pfil_list_t copied from old */
              memcpy(newlist, oldlist, sizeof(pfil_list_t));
              psref_target_init(&newlist->psref, pfil_psref_class);
      
              /*
               * Finally, add the hook.  Note: for PFIL_IN we insert the hooks in
               * reverse order of the PFIL_OUT so that the same path is followed
               * in or out of the kernel.
               */
              if (flags & PFIL_IN) {
                      /* XXX: May want to revisit this later; */
                      size_t len = sizeof(pfil_hook_t) * nhooks;
                      pfh = &newlist->hooks[0];
                      memmove(&newlist->hooks[1], pfh, len);
              } else {
                      pfh = &newlist->hooks[nhooks];
              }
              newlist->nhooks++;
      
              pfh->pfil_func = func;
              pfh->pfil_arg  = arg;
      
              /* switch from oldlist to newlist */
              membar_producer();
              phlistset->active = newlist;
      #ifdef NET_MPSAFE
              pserialize_perform(pfil_psz);
      #endif
              mutex_exit(&pfil_mtx);
      
              /* Wait for all readers */
      #ifdef NET_MPSAFE
              psref_target_destroy(&oldlist->psref, pfil_psref_class);
      #endif
      
              return 0;
      }
      
      /*
       * pfil_add_hook: add a function (hook) to the packet filter head.
       * The possible flags are:
       *
       *        PFIL_IN                call on incoming packets
       *        PFIL_OUT        call on outgoing packets
       *        PFIL_ALL        call on all of the above
       */
      int
      pfil_add_hook(pfil_func_t func, void *arg, int flags, pfil_head_t *ph)
      {
              int error = 0;
      
              KASSERT(func != NULL);
              KASSERT((flags & ~PFIL_ALL) == 0);
      
              for (u_int i = 0; i < __arraycount(pfil_flag_cases); i++) {
                      const int fcase = pfil_flag_cases[i];
                      pfil_listset_t *phlistset;
      
                      if ((flags & fcase) == 0) {
                              continue;
                      }
                      phlistset = pfil_hook_get(fcase, ph);
                      error = pfil_list_add(phlistset, (pfil_polyfunc_t)func, arg,
                          flags);
                      if (error && (error != EEXIST))
                              break;
              }
              if (error && (error != EEXIST)) {
                      pfil_remove_hook(func, arg, flags, ph);
              }
              return error;
      }
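
       /*
        * Registration sketch (illustrative only): a minimal packet filter and
        * how it could be hooked up.  The head type/key (PFIL_TYPE_AF, AF_INET)
        * and the function names are examples; a hook returns non-zero to
        * reject the packet and may set *mp to NULL if it consumed the mbuf.
        *
        *        static int
        *        example_filter(void *arg, struct mbuf **mp, ifnet_t *ifp, int dir)
        *        {
        *                // inspect or modify *mp here
        *                return 0;        // 0 = let the packet continue
        *        }
        *
        *        void
        *        example_filter_attach(void)
        *        {
        *                pfil_head_t *ph;
        *
        *                ph = pfil_head_get(PFIL_TYPE_AF, (void *)AF_INET);
        *                if (ph != NULL)
        *                        (void)pfil_add_hook(example_filter, NULL,
        *                            PFIL_ALL, ph);
        *        }
        *
        * pfil_remove_hook(example_filter, NULL, PFIL_ALL, ph) undoes the
        * registration, e.g. at module unload time.
        */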
      
      /*
       * pfil_add_ihook: add an interface-event function (hook) to the packet
       * filter head.  The possible flags are:
       *
       *        PFIL_IFADDR        call on interface reconfig (cmd is ioctl #)
       *        PFIL_IFNET        call on interface attach/detach (cmd is PFIL_IFNET_*)
       */
      int
      pfil_add_ihook(pfil_ifunc_t func, void *arg, int flags, pfil_head_t *ph)
      {
              pfil_listset_t *phlistset;
      
              KASSERT(func != NULL);
              KASSERT(flags == PFIL_IFADDR || flags == PFIL_IFNET);
      
              phlistset = pfil_hook_get(flags, ph);
              return pfil_list_add(phlistset, (pfil_polyfunc_t)func, arg, flags);
      }
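
       /*
        * Sketch (illustrative only): an interface-event hook.  The function
        * name is hypothetical; the argument types follow the invocation in
        * pfil_run_arg() below (the registered arg, a command, and the
        * ifaddr/ifnet pointer being reported).
        *
        *        static void
        *        example_ifhook(void *arg, unsigned long cmd, void *ptr)
        *        {
        *                // cmd is an ioctl number for PFIL_IFADDR events, or a
        *                // PFIL_IFNET_* value for attach/detach events
        *        }
        *
        *        (void)pfil_add_ihook(example_ifhook, NULL, PFIL_IFNET, ph);
        */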
      
      /*
       * pfil_list_remove: remove the hook from a specified list.
       */
      static int
      pfil_list_remove(pfil_listset_t *phlistset, pfil_polyfunc_t func, void *arg)
      {
              u_int nhooks;
              pfil_list_t *oldlist, *newlist;
      
              mutex_enter(&pfil_mtx);
      
              /* create new pfil_list_t copied from old */
              if (phlistset->active == &phlistset->lists[0]) {
                      oldlist = &phlistset->lists[0];
                      newlist = &phlistset->lists[1];
               } else {
                      oldlist = &phlistset->lists[1];
                      newlist = &phlistset->lists[0];
              }
              memcpy(newlist, oldlist, sizeof(*newlist));
              psref_target_init(&newlist->psref, pfil_psref_class);
      
              nhooks = newlist->nhooks;
              for (u_int i = 0; i < nhooks; i++) {
                      pfil_hook_t *last, *pfh = &newlist->hooks[i];
      
                      if (pfh->pfil_func != func || pfh->pfil_arg != arg) {
                              continue;
                      }
                      if ((last = &newlist->hooks[nhooks - 1]) != pfh) {
                              memcpy(pfh, last, sizeof(pfil_hook_t));
                      }
                      newlist->nhooks--;
      
                      /* switch from oldlist to newlist */
                      phlistset->active = newlist;
                      membar_producer();
      #ifdef NET_MPSAFE
                      pserialize_perform(pfil_psz);
      #endif
                      mutex_exit(&pfil_mtx);
      
                      /* Wait for all readers */
      #ifdef NET_MPSAFE
                      psref_target_destroy(&oldlist->psref, pfil_psref_class);
      #endif
      
                      return 0;
              }
              mutex_exit(&pfil_mtx);
              return ENOENT;
      }
      
      /*
       * pfil_remove_hook: remove the hook from the packet filter head.
       */
      int
      pfil_remove_hook(pfil_func_t func, void *arg, int flags, pfil_head_t *ph)
      {
              KASSERT((flags & ~PFIL_ALL) == 0);
      
              for (u_int i = 0; i < __arraycount(pfil_flag_cases); i++) {
                      const int fcase = pfil_flag_cases[i];
                      pfil_listset_t *pflistset;
      
                      if ((flags & fcase) == 0) {
                              continue;
                      }
                      pflistset = pfil_hook_get(fcase, ph);
                      (void)pfil_list_remove(pflistset, (pfil_polyfunc_t)func, arg);
              }
              return 0;
      }
      
      int
      pfil_remove_ihook(pfil_ifunc_t func, void *arg, int flags, pfil_head_t *ph)
      {
              pfil_listset_t *pflistset;
      
              KASSERT(flags == PFIL_IFADDR || flags == PFIL_IFNET);
              pflistset = pfil_hook_get(flags, ph);
              (void)pfil_list_remove(pflistset, (pfil_polyfunc_t)func, arg);
              return 0;
      }
      
      /*
       * pfil_run_hooks: run the specified packet filter hooks.
       */
      int
      pfil_run_hooks(pfil_head_t *ph, struct mbuf **mp, ifnet_t *ifp, int dir)
      {
               struct mbuf *m = mp ? *mp : NULL;
              pfil_listset_t *phlistset;
              pfil_list_t *phlist;
              struct psref psref;
              int s, bound;
              int ret = 0;
      
              KASSERT(dir == PFIL_IN || dir == PFIL_OUT);
               if (__predict_false((phlistset = pfil_hook_get(dir, ph)) == NULL)) {
                      return ret;
              }
      
               bound = curlwp_bind();
              s = pserialize_read_enter();
              phlist = phlistset->active;
              membar_datadep_consumer();
              psref_acquire(&psref, &phlist->psref, pfil_psref_class);
              pserialize_read_exit(s);
              for (u_int i = 0; i < phlist->nhooks; i++) {
                      pfil_hook_t *pfh = &phlist->hooks[i];
                      pfil_func_t func = (pfil_func_t)pfh->pfil_func;
      
                      ret = (*func)(pfh->pfil_arg, &m, ifp, dir);
                      if (m == NULL || ret)
                              break;
              }
               psref_release(&psref, &phlist->psref, pfil_psref_class);
               curlwp_bindx(bound);

               if (mp) {
                       *mp = m;
              }
              return ret;
      }
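
       /*
        * Caller-side sketch (illustrative only): how an input path would run
        * the hooks on an mbuf.  "example_pfil_head" stands for whatever head
        * the protocol registered; the error/NULL conventions follow the loop
        * above.
        *
        *        if (pfil_run_hooks(example_pfil_head, &m, ifp, PFIL_IN) != 0)
        *                return;                // a hook rejected the packet
        *        if (m == NULL)
        *                return;                // a hook consumed the mbuf
        *        // continue normal processing of m
        */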
      
      static void
      pfil_run_arg(pfil_listset_t *phlistset, u_long cmd, void *arg)
      {
              pfil_list_t *phlist;
              struct psref psref;
              int s, bound;
      
              bound = curlwp_bind();
              s = pserialize_read_enter();
              phlist = phlistset->active;
              membar_datadep_consumer();
              psref_acquire(&psref, &phlist->psref, pfil_psref_class);
              pserialize_read_exit(s);
              for (u_int i = 0; i < phlist->nhooks; i++) {
                      pfil_hook_t *pfh = &phlist->hooks[i];
                      pfil_ifunc_t func = (pfil_ifunc_t)pfh->pfil_func;
                      (*func)(pfh->pfil_arg, cmd, arg);
              }
              psref_release(&psref, &phlist->psref, pfil_psref_class);
              curlwp_bindx(bound);
      }
      
      void
      pfil_run_addrhooks(pfil_head_t *ph, u_long cmd, struct ifaddr *ifa)
      {
              pfil_run_arg(&ph->ph_ifaddr, cmd, ifa);
      }
      
      void
      pfil_run_ifhooks(pfil_head_t *ph, u_long cmd, struct ifnet *ifp)
      {
              pfil_run_arg(&ph->ph_ifevent, cmd, ifp);
      }
      /*        $NetBSD: uvm_km.c,v 1.150 2019/12/01 23:14:47 uwe Exp $        */
      
      /*
       * Copyright (c) 1997 Charles D. Cranor and Washington University.
       * Copyright (c) 1991, 1993, The Regents of the University of California.
       *
       * All rights reserved.
       *
       * This code is derived from software contributed to Berkeley by
       * The Mach Operating System project at Carnegie-Mellon University.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)vm_kern.c   8.3 (Berkeley) 1/12/94
       * from: Id: uvm_km.c,v 1.1.2.14 1998/02/06 05:19:27 chs Exp
       *
       *
       * Copyright (c) 1987, 1990 Carnegie-Mellon University.
       * All rights reserved.
       *
       * Permission to use, copy, modify and distribute this software and
       * its documentation is hereby granted, provided that both the copyright
       * notice and this permission notice appear in all copies of the
       * software, derivative works or modified versions, and any portions
       * thereof, and that both notices appear in supporting documentation.
       *
       * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
       * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
       * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
       *
       * Carnegie Mellon requests users of this software to return to
       *
       *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
       *  School of Computer Science
       *  Carnegie Mellon University
       *  Pittsburgh PA 15213-3890
       *
       * any improvements or extensions that they make and grant Carnegie the
       * rights to redistribute these changes.
       */
      
      /*
       * uvm_km.c: handle kernel memory allocation and management
       */
      
      /*
       * overview of kernel memory management:
       *
       * the kernel virtual address space is mapped by "kernel_map."   kernel_map
       * starts at VM_MIN_KERNEL_ADDRESS and goes to VM_MAX_KERNEL_ADDRESS.
       * note that VM_MIN_KERNEL_ADDRESS is equal to vm_map_min(kernel_map).
       *
       * the kernel_map has several "submaps."   submaps can only appear in
       * the kernel_map (user processes can't use them).   submaps "take over"
       * the management of a sub-range of the kernel's address space.  submaps
       * are typically allocated at boot time and are never released.   kernel
       * virtual address space that is mapped by a submap is locked by the
       * submap's lock -- not the kernel_map's lock.
       *
       * thus, the useful feature of submaps is that they allow us to break
       * up the locking and protection of the kernel address space into smaller
       * chunks.
       *
       * the vm system has several standard kernel submaps/arenas, including:
       *   kmem_arena => used for kmem/pool (memoryallocators(9))
       *   pager_map => used to map "buf" structures into kernel space
       *   exec_map => used during exec to handle exec args
       *   etc...
       *
       * The kmem_arena is a "special submap", as it lives in a fixed map entry
       * within the kernel_map and is controlled by vmem(9).
       *
       * the kernel allocates its private memory out of special uvm_objects whose
       * reference count is set to UVM_OBJ_KERN (thus indicating that the objects
       * are "special" and never die).   all kernel objects should be thought of
       * as large, fixed-sized, sparsely populated uvm_objects.   each kernel
       * object is equal to the size of kernel virtual address space (i.e. the
       * value "VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS").
       *
       * note that just because a kernel object spans the entire kernel virtual
       * address space doesn't mean that it has to be mapped into the entire space.
       * large chunks of a kernel object's space go unused either because
       * that area of kernel VM is unmapped, or there is some other type of
        * object mapped into that range (e.g. a vnode).    for a submap's kernel
       * objects, the only part of the object that can ever be populated is the
       * offsets that are managed by the submap.
       *
       * note that the "offset" in a kernel object is always the kernel virtual
       * address minus the VM_MIN_KERNEL_ADDRESS (aka vm_map_min(kernel_map)).
       * example:
       *   suppose VM_MIN_KERNEL_ADDRESS is 0xf8000000 and the kernel does a
       *   uvm_km_alloc(kernel_map, PAGE_SIZE) [allocate 1 wired down page in the
       *   kernel map].    if uvm_km_alloc returns virtual address 0xf8235000,
       *   then that means that the page at offset 0x235000 in kernel_object is
       *   mapped at 0xf8235000.
       *
        * kernel objects have one other special property: when the kernel virtual
       * memory mapping them is unmapped, the backing memory in the object is
       * freed right away.   this is done with the uvm_km_pgremove() function.
       * this has to be done because there is no backing store for kernel pages
       * and no need to save them after they are no longer referenced.
       *
       * Generic arenas:
       *
       * kmem_arena:
       *        Main arena controlling the kernel KVA used by other arenas.
       *
       * kmem_va_arena:
        *        Implements quantum caching in order to speed up allocations and
        *        reduce fragmentation.  The kmem(9) subsystem uses this arena, as
        *        does pool(9), unless the pool was created with a custom meta-data
        *        allocator.
       *
       * Arenas for meta-data allocations are used by vmem(9) and pool(9).
        * These arenas cannot use the quantum cache.  However, kmem_va_meta_arena
        * compensates for this by importing larger chunks from kmem_arena.
       *
       * kmem_va_meta_arena:
       *        Space for meta-data.
       *
       * kmem_meta_arena:
       *        Imports from kmem_va_meta_arena.  Allocations from this arena are
       *        backed with the pages.
       *
       * Arena stacking:
       *
       *        kmem_arena
       *                kmem_va_arena
       *                kmem_va_meta_arena
       *                        kmem_meta_arena
       */
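
       /*
        * Illustrative restatement of the offset rule above (the addresses are
        * the hypothetical ones from the example):
        *
        *        vaddr_t kva = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
        *            UVM_KMF_PAGEABLE);                        // e.g. 0xf8235000
        *        voff_t off = kva - vm_map_min(kernel_map);        // 0x235000
        *
        * once a page is faulted in at kva, it lives at offset "off" in
        * uvm_kernel_object.
        */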
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: uvm_km.c,v 1.150 2019/12/01 23:14:47 uwe Exp $");
      
      #include "opt_uvmhist.h"
      
      #include "opt_kmempages.h"
      
      #ifndef NKMEMPAGES
      #define NKMEMPAGES 0
      #endif
      
      /*
       * Defaults for lower and upper-bounds for the kmem_arena page count.
       * Can be overridden by kernel config options.
       */
      #ifndef NKMEMPAGES_MIN
      #define NKMEMPAGES_MIN NKMEMPAGES_MIN_DEFAULT
      #endif
      
      #ifndef NKMEMPAGES_MAX
      #define NKMEMPAGES_MAX NKMEMPAGES_MAX_DEFAULT
      #endif
      
      
      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/atomic.h>
      #include <sys/proc.h>
      #include <sys/pool.h>
      #include <sys/vmem.h>
      #include <sys/vmem_impl.h>
      #include <sys/kmem.h>
      #include <sys/msan.h>
      
      #include <uvm/uvm.h>
      
      /*
       * global data structures
       */
      
      struct vm_map *kernel_map = NULL;
      
      /*
        * local data structures
       */
      
      static struct vm_map                kernel_map_store;
      static struct vm_map_entry        kernel_image_mapent_store;
      static struct vm_map_entry        kernel_kmem_mapent_store;
      
      int nkmempages = 0;
      vaddr_t kmembase;
      vsize_t kmemsize;
      
      static struct vmem kmem_arena_store;
      vmem_t *kmem_arena = NULL;
      static struct vmem kmem_va_arena_store;
      vmem_t *kmem_va_arena;
      
      /*
       * kmeminit_nkmempages: calculate the size of kmem_arena.
       */
      void
      kmeminit_nkmempages(void)
      {
              int npages;
      
              if (nkmempages != 0) {
                      /*
                        * It's already been set (by us being here before);
                        * bail out now.
                       */
                      return;
              }
      
      #if defined(KMSAN)
              npages = (physmem / 8);
      #elif defined(PMAP_MAP_POOLPAGE)
              npages = (physmem / 4);
      #else
              npages = (physmem / 3) * 2;
      #endif /* defined(PMAP_MAP_POOLPAGE) */
      
      #ifndef NKMEMPAGES_MAX_UNLIMITED
              if (npages > NKMEMPAGES_MAX)
                      npages = NKMEMPAGES_MAX;
      #endif
      
              if (npages < NKMEMPAGES_MIN)
                      npages = NKMEMPAGES_MIN;
      
              nkmempages = npages;
      }
      
      /*
       * uvm_km_bootstrap: init kernel maps and objects to reflect reality (i.e.
       * KVM already allocated for text, data, bss, and static data structures).
       *
       * => KVM is defined by VM_MIN_KERNEL_ADDRESS/VM_MAX_KERNEL_ADDRESS.
       *    we assume that [vmin -> start] has already been allocated and that
       *    "end" is the end.
       */
      
      void
      uvm_km_bootstrap(vaddr_t start, vaddr_t end)
      {
              bool kmem_arena_small;
              vaddr_t base = VM_MIN_KERNEL_ADDRESS;
              struct uvm_map_args args;
              int error;
      
              UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
              UVMHIST_LOG(maphist, "start=%#jx end=%#jx", start, end, 0,0);
      
              kmeminit_nkmempages();
              kmemsize = (vsize_t)nkmempages * PAGE_SIZE;
              kmem_arena_small = kmemsize < 64 * 1024 * 1024;
      
              UVMHIST_LOG(maphist, "kmemsize=%#jx", kmemsize, 0,0,0);
      
              /*
               * next, init kernel memory objects.
               */
      
              /* kernel_object: for pageable anonymous kernel memory */
              uvm_kernel_object = uao_create(VM_MAX_KERNEL_ADDRESS -
                                      VM_MIN_KERNEL_ADDRESS, UAO_FLAG_KERNOBJ);
      
              /*
               * init the map and reserve any space that might already
               * have been allocated kernel space before installing.
               */
      
              uvm_map_setup(&kernel_map_store, base, end, VM_MAP_PAGEABLE);
              kernel_map_store.pmap = pmap_kernel();
              if (start != base) {
                      error = uvm_map_prepare(&kernel_map_store,
                          base, start - base,
                          NULL, UVM_UNKNOWN_OFFSET, 0,
                          UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
                                          UVM_ADV_RANDOM, UVM_FLAG_FIXED), &args);
                      if (!error) {
                              kernel_image_mapent_store.flags =
                                  UVM_MAP_KERNEL | UVM_MAP_STATIC | UVM_MAP_NOMERGE;
                              error = uvm_map_enter(&kernel_map_store, &args,
                                  &kernel_image_mapent_store);
                      }
      
                      if (error)
                              panic(
                                  "uvm_km_bootstrap: could not reserve space for kernel");
      
                      kmembase = args.uma_start + args.uma_size;
              } else {
                      kmembase = base;
              }
      
              error = uvm_map_prepare(&kernel_map_store,
                  kmembase, kmemsize,
                  NULL, UVM_UNKNOWN_OFFSET, 0,
                  UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
                                  UVM_ADV_RANDOM, UVM_FLAG_FIXED), &args);
              if (!error) {
                      kernel_kmem_mapent_store.flags =
                          UVM_MAP_KERNEL | UVM_MAP_STATIC | UVM_MAP_NOMERGE;
                      error = uvm_map_enter(&kernel_map_store, &args,
                          &kernel_kmem_mapent_store);
              }
      
              if (error)
                      panic("uvm_km_bootstrap: could not reserve kernel kmem");
      
              /*
               * install!
               */
      
              kernel_map = &kernel_map_store;
      
              pool_subsystem_init();
      
              kmem_arena = vmem_init(&kmem_arena_store, "kmem",
                  kmembase, kmemsize, PAGE_SIZE, NULL, NULL, NULL,
                  0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);
      #ifdef PMAP_GROWKERNEL
              /*
               * kmem_arena VA allocations happen independently of uvm_map.
               * grow kernel to accommodate the kmem_arena.
               */
              if (uvm_maxkaddr < kmembase + kmemsize) {
                      uvm_maxkaddr = pmap_growkernel(kmembase + kmemsize);
                      KASSERTMSG(uvm_maxkaddr >= kmembase + kmemsize,
                          "%#"PRIxVADDR" %#"PRIxVADDR" %#"PRIxVSIZE,
                          uvm_maxkaddr, kmembase, kmemsize);
              }
      #endif
      
              vmem_subsystem_init(kmem_arena);
      
              UVMHIST_LOG(maphist, "kmem vmem created (base=%#jx, size=%#jx",
                  kmembase, kmemsize, 0,0);
      
              kmem_va_arena = vmem_init(&kmem_va_arena_store, "kva",
                  0, 0, PAGE_SIZE, vmem_alloc, vmem_free, kmem_arena,
                  (kmem_arena_small ? 4 : VMEM_QCACHE_IDX_MAX) * PAGE_SIZE,
                  VM_NOSLEEP, IPL_VM);
      
              UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
      }
      
      /*
       * uvm_km_init: init the kernel maps virtual memory caches
       * and start the pool/kmem allocator.
       */
      void
      uvm_km_init(void)
      {
              kmem_init();
      }
      
      /*
       * uvm_km_suballoc: allocate a submap in the kernel map.   once a submap
       * is allocated all references to that area of VM must go through it.  this
       * allows the locking of VAs in kernel_map to be broken up into regions.
       *
       * => if `fixed' is true, *vmin specifies where the region described
       *      by the submap must start
       * => if submap is non NULL we use that as the submap, otherwise we
       *        alloc a new map
       */
      
      struct vm_map *
      uvm_km_suballoc(struct vm_map *map, vaddr_t *vmin /* IN/OUT */,
          vaddr_t *vmax /* OUT */, vsize_t size, int flags, bool fixed,
          struct vm_map *submap)
      {
              int mapflags = UVM_FLAG_NOMERGE | (fixed ? UVM_FLAG_FIXED : 0);
              UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
      
              KASSERT(vm_map_pmap(map) == pmap_kernel());
      
              size = round_page(size);        /* round up to pagesize */
      
              /*
               * first allocate a blank spot in the parent map
               */
      
              if (uvm_map(map, vmin, size, NULL, UVM_UNKNOWN_OFFSET, 0,
                  UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
                  UVM_ADV_RANDOM, mapflags)) != 0) {
                      panic("%s: unable to allocate space in parent map", __func__);
              }
      
              /*
               * set VM bounds (vmin is filled in by uvm_map)
               */
      
              *vmax = *vmin + size;
      
              /*
               * add references to pmap and create or init the submap
               */
      
              pmap_reference(vm_map_pmap(map));
              if (submap == NULL) {
                      submap = kmem_alloc(sizeof(*submap), KM_SLEEP);
              }
              uvm_map_setup(submap, *vmin, *vmax, flags);
              submap->pmap = vm_map_pmap(map);
      
              /*
               * now let uvm_map_submap plug in it...
               */
      
              if (uvm_map_submap(map, *vmin, *vmax, submap) != 0)
                      panic("uvm_km_suballoc: submap allocation failed");
      
              return(submap);
      }
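
       /*
        * Caller sketch (modeled on typical cpu_startup() usage; the size and
        * the map name are examples only): carving a pageable submap out of
        * kernel_map.
        *
        *        vaddr_t minaddr = 0, maxaddr;
        *        struct vm_map *example_map;
        *
        *        example_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
        *            16 * PAGE_SIZE, VM_MAP_PAGEABLE, false, NULL);
        *
        * example_map now manages [minaddr, maxaddr) under its own lock.
        */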
      
      /*
       * uvm_km_pgremove: remove pages from a kernel uvm_object and KVA.
       */
      
      void
      uvm_km_pgremove(vaddr_t startva, vaddr_t endva)
      {
               struct uvm_object * const uobj = uvm_kernel_object;
              const voff_t start = startva - vm_map_min(kernel_map);
              const voff_t end = endva - vm_map_min(kernel_map);
              struct vm_page *pg;
              voff_t curoff, nextoff;
              int swpgonlydelta = 0;
              UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
      
              KASSERT(VM_MIN_KERNEL_ADDRESS <= startva);
               KASSERT(startva < endva);
               KASSERT(endva <= VM_MAX_KERNEL_ADDRESS);

               mutex_enter(uobj->vmobjlock);
               pmap_remove(pmap_kernel(), startva, endva);
               for (curoff = start; curoff < end; curoff = nextoff) {
                       nextoff = curoff + PAGE_SIZE;
                       pg = uvm_pagelookup(uobj, curoff);
                       if (pg != NULL && pg->flags & PG_BUSY) {
                              pg->flags |= PG_WANTED;
                              UVM_UNLOCK_AND_WAIT(pg, uobj->vmobjlock, 0,
                                          "km_pgrm", 0);
                              mutex_enter(uobj->vmobjlock);
                              nextoff = curoff;
                              continue;
                      }
      
                      /*
                       * free the swap slot, then the page.
                       */
      
                      if (pg == NULL &&
                           uao_find_swslot(uobj, curoff >> PAGE_SHIFT) > 0) {
                              swpgonlydelta++;
                      }
                       uao_dropswap(uobj, curoff >> PAGE_SHIFT);
                      if (pg != NULL) {
                              mutex_enter(&uvm_pageqlock);
                              uvm_pagefree(pg);
                              mutex_exit(&uvm_pageqlock);
                      }
              }
               mutex_exit(uobj->vmobjlock);
      
              if (swpgonlydelta > 0) {
                      KASSERT(uvmexp.swpgonly >= swpgonlydelta);
                      atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
              }
      }
      
      
      /*
       * uvm_km_pgremove_intrsafe: like uvm_km_pgremove(), but for non object backed
       *    regions.
       *
       * => when you unmap a part of anonymous kernel memory you want to toss
       *    the pages right away.    (this is called from uvm_unmap_...).
       * => none of the pages will ever be busy, and none of them will ever
       *    be on the active or inactive queues (because they have no object).
       */
      
      void
      uvm_km_pgremove_intrsafe(struct vm_map *map, vaddr_t start, vaddr_t end)
      {
      #define __PGRM_BATCH 16
              struct vm_page *pg;
               paddr_t pa[__PGRM_BATCH];
              int npgrm, i;
              vaddr_t va, batch_vastart;
      
              UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
      
              KASSERT(VM_MAP_IS_KERNEL(map));
               KASSERTMSG(vm_map_min(map) <= start,
                   "vm_map_min(map) [%#"PRIxVADDR"] <= start [%#"PRIxVADDR"]"
                   " (size=%#"PRIxVSIZE")",
                   vm_map_min(map), start, end - start);
               KASSERT(start < end);
               KASSERT(end <= vm_map_max(map));

               for (va = start; va < end;) {
                      batch_vastart = va;
                      /* create a batch of at most __PGRM_BATCH pages to free */
                      for (i = 0;
                            i < __PGRM_BATCH && va < end;
                            va += PAGE_SIZE) {
                               if (!pmap_extract(pmap_kernel(), va, &pa[i])) {
                                       continue;
                               }
                               i++;
                       }
                       npgrm = i;
                       /* now remove the mappings */
                       pmap_kremove(batch_vastart, va - batch_vastart);
                       /* and free the pages */
                       for (i = 0; i < npgrm; i++) {
                               pg = PHYS_TO_VM_PAGE(pa[i]);
                               KASSERT(pg);
                               KASSERT(pg->uobject == NULL && pg->uanon == NULL);
                               KASSERT((pg->flags & PG_BUSY) == 0);
                               uvm_pagefree(pg);
                      }
              }
      #undef __PGRM_BATCH
      }
      
      #if defined(DEBUG)
      void
      uvm_km_check_empty(struct vm_map *map, vaddr_t start, vaddr_t end)
      {
              struct vm_page *pg;
              vaddr_t va;
               paddr_t pa;
              UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
      
              KDASSERT(VM_MAP_IS_KERNEL(map));
               KDASSERT(vm_map_min(map) <= start);
               KDASSERT(start < end);
               KDASSERT(end <= vm_map_max(map));

               for (va = start; va < end; va += PAGE_SIZE) {
                       if (pmap_extract(pmap_kernel(), va, &pa)) {
                               panic("uvm_km_check_empty: va %p has pa 0x%llx",
                                   (void *)va, (long long)pa);
                       }
                       mutex_enter(uvm_kernel_object->vmobjlock);
                      pg = uvm_pagelookup(uvm_kernel_object,
                          va - vm_map_min(kernel_map));
                      mutex_exit(uvm_kernel_object->vmobjlock);
                      if (pg) {
                              panic("uvm_km_check_empty: "
                                  "has page hashed at %p", (const void *)va);
                      }
              }
      }
      #endif /* defined(DEBUG) */
      
      /*
       * uvm_km_alloc: allocate an area of kernel memory.
       *
       * => NOTE: we can return 0 even if we can wait if there is not enough
       *        free VM space in the map... caller should be prepared to handle
       *        this case.
       * => we return KVA of memory allocated
       */
      
      vaddr_t
      uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
      {
               vaddr_t kva, loopva;
              vaddr_t offset;
              vsize_t loopsize;
              struct vm_page *pg;
              struct uvm_object *obj;
              int pgaflags;
              vm_prot_t prot, vaprot;
              UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
      
              KASSERT(vm_map_pmap(map) == pmap_kernel());
               KASSERT((flags & UVM_KMF_TYPEMASK) == UVM_KMF_WIRED ||
                       (flags & UVM_KMF_TYPEMASK) == UVM_KMF_PAGEABLE ||
                       (flags & UVM_KMF_TYPEMASK) == UVM_KMF_VAONLY);
               KASSERT((flags & UVM_KMF_VAONLY) != 0 || (flags & UVM_KMF_COLORMATCH) == 0);
              KASSERT((flags & UVM_KMF_COLORMATCH) == 0 || (flags & UVM_KMF_VAONLY) != 0);
      
              /*
               * setup for call
               */
      
               kva = vm_map_min(map);        /* hint */
               size = round_page(size);
               obj = (flags & UVM_KMF_PAGEABLE) ? uvm_kernel_object : NULL;
              UVMHIST_LOG(maphist,"  (map=0x%#jx, obj=0x%#jx, size=0x%jx, flags=%jd)",
                  (uintptr_t)map, (uintptr_t)obj, size, flags);
      
              /*
               * allocate some virtual space
               */
      
               vaprot = (flags & UVM_KMF_EXEC) ? UVM_PROT_ALL : UVM_PROT_RW;
               if (__predict_false(uvm_map(map, &kva, size, obj, UVM_UNKNOWN_OFFSET,
                   align, UVM_MAPFLAG(vaprot, UVM_PROT_ALL, UVM_INH_NONE,
                   UVM_ADV_RANDOM,
                   (flags & (UVM_KMF_TRYLOCK | UVM_KMF_NOWAIT | UVM_KMF_WAITVA
                    | UVM_KMF_COLORMATCH)))) != 0)) {
                       UVMHIST_LOG(maphist, "<- done (no VM)",0,0,0,0);
                       return(0);
              }
      
              /*
               * if all we wanted was VA, return now
               */
      
               if (flags & (UVM_KMF_VAONLY | UVM_KMF_PAGEABLE)) {
                      UVMHIST_LOG(maphist,"<- done valloc (kva=0x%jx)", kva,0,0,0);
                      return(kva);
              }
      
              /*
               * recover object offset from virtual address
               */
      
               offset = kva - vm_map_min(kernel_map);
              UVMHIST_LOG(maphist, "  kva=0x%jx, offset=0x%jx", kva, offset,0,0);
      
              /*
               * now allocate and map in the memory... note that we are the only ones
        * who should ever get a handle on this area of VM.
               */
      
              loopva = kva;
              loopsize = size;
      
              pgaflags = UVM_FLAG_COLORMATCH;
              if (flags & UVM_KMF_NOWAIT)
                      pgaflags |= UVM_PGA_USERESERVE;
               if (flags & UVM_KMF_ZERO)
                      pgaflags |= UVM_PGA_ZERO;
              prot = VM_PROT_READ | VM_PROT_WRITE;
              if (flags & UVM_KMF_EXEC)
                      prot |= VM_PROT_EXECUTE;
               while (loopsize) {
                       KASSERTMSG(!pmap_extract(pmap_kernel(), loopva, NULL),
                           "loopva=%#"PRIxVADDR, loopva);

                       pg = uvm_pagealloc_strat(NULL, offset, NULL, pgaflags,
      #ifdef UVM_KM_VMFREELIST
                         UVM_PGA_STRAT_ONLY, UVM_KM_VMFREELIST
      #else
                         UVM_PGA_STRAT_NORMAL, 0
      #endif
                         );
      
                      /*
                       * out of memory?
                       */
      
                      if (__predict_false(pg == NULL)) {
                              if ((flags & UVM_KMF_NOWAIT) ||
                                  ((flags & UVM_KMF_CANFAIL) && !uvm_reclaimable())) {
                                      /* free everything! */
                                      uvm_km_free(map, kva, size,
                                          flags & UVM_KMF_TYPEMASK);
                                      return (0);
                              } else {
                                      uvm_wait("km_getwait2");        /* sleep here */
                                      continue;
                              }
                      }
      
                       pg->flags &= ~PG_BUSY;        /* new page */
                      UVM_PAGE_OWN(pg, NULL);
      
                      /*
                       * map it in
                       */
      
                      pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg),
                          prot, PMAP_KMPAGE);
                      loopva += PAGE_SIZE;
                      offset += PAGE_SIZE;
                      loopsize -= PAGE_SIZE;
              }
      
               pmap_update(pmap_kernel());
      
              if ((flags & UVM_KMF_ZERO) == 0) {
                      kleak_fill_area((void *)kva, size);
                      kmsan_orig((void *)kva, size, KMSAN_TYPE_UVM, __RET_ADDR);
                      kmsan_mark((void *)kva, size, KMSAN_STATE_UNINIT);
              }
      
              UVMHIST_LOG(maphist,"<- done (kva=0x%jx)", kva,0,0,0);
              return(kva);
      }
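
       /*
        * Caller sketch (illustrative only; "len" is a hypothetical size): a
        * wired, zeroed allocation with failure handling, and the matching
        * free.  As noted above, a return value of 0 means no KVA could be
        * obtained and must be handled.
        *
        *        vaddr_t va;
        *
        *        va = uvm_km_alloc(kernel_map, round_page(len), 0,
        *            UVM_KMF_WIRED | UVM_KMF_ZERO | UVM_KMF_CANFAIL);
        *        if (va == 0)
        *                return ENOMEM;
        *        ...
        *        uvm_km_free(kernel_map, va, round_page(len), UVM_KMF_WIRED);
        */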
      
      /*
       * uvm_km_protect: change the protection of an allocated area
       */
      
      int
      uvm_km_protect(struct vm_map *map, vaddr_t addr, vsize_t size, vm_prot_t prot)
      {
              return uvm_map_protect(map, addr, addr + round_page(size), prot, false);
      }
      
      /*
       * uvm_km_free: free an area of kernel memory
       */
      
      void
      uvm_km_free(struct vm_map *map, vaddr_t addr, vsize_t size, uvm_flag_t flags)
      {
              UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
      
               KASSERT((flags & UVM_KMF_TYPEMASK) == UVM_KMF_WIRED ||
                       (flags & UVM_KMF_TYPEMASK) == UVM_KMF_PAGEABLE ||
                       (flags & UVM_KMF_TYPEMASK) == UVM_KMF_VAONLY);
               KASSERT((addr & PAGE_MASK) == 0);
               KASSERT(vm_map_pmap(map) == pmap_kernel());

               size = round_page(size);

               if (flags & UVM_KMF_PAGEABLE) {
                       uvm_km_pgremove(addr, addr + size);
               } else if (flags & UVM_KMF_WIRED) {
                      /*
                       * Note: uvm_km_pgremove_intrsafe() extracts mapping, thus
                       * remove it after.  See comment below about KVA visibility.
                       */
                      uvm_km_pgremove_intrsafe(map, addr, addr + size);
              }
      
              /*
               * Note: uvm_unmap_remove() calls pmap_update() for us, before
               * KVA becomes globally available.
               */
      
               uvm_unmap1(map, addr, addr + size, UVM_FLAG_VAONLY);
      }
      
      /* Sanity; must specify both or none. */
      #if (defined(PMAP_MAP_POOLPAGE) || defined(PMAP_UNMAP_POOLPAGE)) && \
          (!defined(PMAP_MAP_POOLPAGE) || !defined(PMAP_UNMAP_POOLPAGE))
      #error Must specify MAP and UNMAP together.
      #endif
      
      int
      uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags,
          vmem_addr_t *addr)
      {
              struct vm_page *pg;
               vmem_addr_t va;
              int rc;
              vaddr_t loopva;
              vsize_t loopsize;
      
              size = round_page(size);
      
      #if defined(PMAP_MAP_POOLPAGE)
              if (size == PAGE_SIZE) {
      again:
      #ifdef PMAP_ALLOC_POOLPAGE
                      pg = PMAP_ALLOC_POOLPAGE((flags & VM_SLEEP) ?
                         0 : UVM_PGA_USERESERVE);
      #else
                      pg = uvm_pagealloc(NULL, 0, NULL,
                         (flags & VM_SLEEP) ? 0 : UVM_PGA_USERESERVE);
      #endif /* PMAP_ALLOC_POOLPAGE */
                      if (__predict_false(pg == NULL)) {
                              if (flags & VM_SLEEP) {
                                      uvm_wait("plpg");
                                      goto again;
                              }
                              return ENOMEM;
                      }
                      va = PMAP_MAP_POOLPAGE(VM_PAGE_TO_PHYS(pg));
                      KASSERT(va != 0);
                      *addr = va;
                      return 0;
              }
      #endif /* PMAP_MAP_POOLPAGE */
      
              rc = vmem_alloc(vm, size, flags, &va);
              if (rc != 0)
                      return rc;
      
      #ifdef PMAP_GROWKERNEL
              /*
               * These VA allocations happen independently of uvm_map 
               * so this allocation must not extend beyond the current limit.
               */
               KASSERTMSG(uvm_maxkaddr >= va + size,
                  "%#"PRIxVADDR" %#"PRIxPTR" %#zx",
                  uvm_maxkaddr, va, size);
      #endif
      
              loopva = va;
              loopsize = size;
      
               while (loopsize) {
                       paddr_t pa __diagused;
                       KASSERTMSG(!pmap_extract(pmap_kernel(), loopva, &pa),
                           "loopva=%#"PRIxVADDR" loopsize=%#"PRIxVSIZE
                           " pa=%#"PRIxPADDR" vmem=%p",
                           loopva, loopsize, pa, vm);

                       pg = uvm_pagealloc(NULL, loopva, NULL,
                           UVM_FLAG_COLORMATCH
                           | ((flags & VM_SLEEP) ? 0 : UVM_PGA_USERESERVE));
                       if (__predict_false(pg == NULL)) {
                              if (flags & VM_SLEEP) {
                                      uvm_wait("plpg");
                                      continue;
                              } else {
                                      uvm_km_pgremove_intrsafe(kernel_map, va,
                                          va + size);
                                      vmem_free(vm, va, size);
                                       return ENOMEM;
                              }
                      }
      
                      pg->flags &= ~PG_BUSY;        /* new page */
                      UVM_PAGE_OWN(pg, NULL);
                      pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg),
                          VM_PROT_READ|VM_PROT_WRITE, PMAP_KMPAGE);
      
                      loopva += PAGE_SIZE;
                      loopsize -= PAGE_SIZE;
              }
               pmap_update(pmap_kernel());
      
              *addr = va;
      
              return 0;
      }
      
      void
      uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, size_t size)
      {
      
               size = round_page(size);
      #if defined(PMAP_UNMAP_POOLPAGE)
              if (size == PAGE_SIZE) {
                      paddr_t pa;
      
                      pa = PMAP_UNMAP_POOLPAGE(addr);
                      uvm_pagefree(PHYS_TO_VM_PAGE(pa));
                      return;
              }
      #endif /* PMAP_UNMAP_POOLPAGE */
              uvm_km_pgremove_intrsafe(kernel_map, addr, addr + size);
              pmap_update(pmap_kernel());
      
              vmem_free(vm, addr, size);
      }
      
      bool
      uvm_km_va_starved_p(void)
      {
              vmem_size_t total;
              vmem_size_t free;
      
               if (kmem_arena == NULL)
                       return false;

               total = vmem_size(kmem_arena, VMEM_ALLOC|VMEM_FREE);
               free = vmem_size(kmem_arena, VMEM_FREE);

               return (free < (total / 10));
      }
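
       /*
        * Caller sketch (illustrative only; example_cache_drain() is
        * hypothetical): reclaiming cached objects when less than 10% of the
        * kmem arena is free.
        *
        *        if (uvm_km_va_starved_p())
        *                example_cache_drain();
        */
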
      /* $NetBSD: subr_autoconf.c,v 1.265 2018/12/01 02:08:16 msaitoh Exp $ */
      
      /*
       * Copyright (c) 1996, 2000 Christopher G. Demetriou
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. All advertising materials mentioning features or use of this software
       *    must display the following acknowledgement:
       *          This product includes software developed for the
       *          NetBSD Project.  See http://www.NetBSD.org/ for
       *          information about NetBSD.
       * 4. The name of the author may not be used to endorse or promote products
       *    derived from this software without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
       * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
       * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
       * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
       * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
       * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
       * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       *
       * --(license Id: LICENSE.proto,v 1.1 2000/06/13 21:40:26 cgd Exp )--
       */
      
      /*
       * Copyright (c) 1992, 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * This software was developed by the Computer Systems Engineering group
       * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
       * contributed to Berkeley.
       *
       * All advertising materials mentioning features or use of this software
       * must display the following acknowledgement:
       *        This product includes software developed by the University of
       *        California, Lawrence Berkeley Laboratories.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       * from: Header: subr_autoconf.c,v 1.12 93/02/01 19:31:48 torek Exp  (LBL)
       *
       *        @(#)subr_autoconf.c        8.3 (Berkeley) 5/17/94
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: subr_autoconf.c,v 1.265 2018/12/01 02:08:16 msaitoh Exp $");
      
      #ifdef _KERNEL_OPT
      #include "opt_ddb.h"
      #include "drvctl.h"
      #endif
      
      #include <sys/param.h>
      #include <sys/device.h>
      #include <sys/disklabel.h>
      #include <sys/conf.h>
      #include <sys/kauth.h>
      #include <sys/kmem.h>
      #include <sys/systm.h>
      #include <sys/kernel.h>
      #include <sys/errno.h>
      #include <sys/proc.h>
      #include <sys/reboot.h>
      #include <sys/kthread.h>
      #include <sys/buf.h>
      #include <sys/dirent.h>
      #include <sys/mount.h>
      #include <sys/namei.h>
      #include <sys/unistd.h>
      #include <sys/fcntl.h>
      #include <sys/lockf.h>
      #include <sys/callout.h>
      #include <sys/devmon.h>
      #include <sys/cpu.h>
      #include <sys/sysctl.h>
      
      #include <sys/disk.h>
      
      #include <sys/rndsource.h>
      
      #include <machine/limits.h>
      
      /*
       * Autoconfiguration subroutines.
       */
      
      /*
       * Device autoconfiguration timings are mixed into the entropy pool.
       */
      extern krndsource_t rnd_autoconf_source;
      
      /*
       * ioconf.c exports exactly two names: cfdata and cfroots.  All system
       * devices and drivers are found via these tables.
       */
      extern struct cfdata cfdata[];
      extern const short cfroots[];
      
      /*
       * List of all cfdriver structures.  We use this to detect duplicates
       * when other cfdrivers are loaded.
       */
      struct cfdriverlist allcfdrivers = LIST_HEAD_INITIALIZER(&allcfdrivers);
      extern struct cfdriver * const cfdriver_list_initial[];
      
      /*
       * Initial list of cfattach's.
       */
      extern const struct cfattachinit cfattachinit[];
      
      /*
       * List of cfdata tables.  We always have one such list -- the one
       * built statically when the kernel was configured.
       */
      struct cftablelist allcftables = TAILQ_HEAD_INITIALIZER(allcftables);
      static struct cftable initcftable;
      
      #define        ROOT ((device_t)NULL)
      
      struct matchinfo {
              cfsubmatch_t fn;
              device_t parent;
              const int *locs;
              void        *aux;
              struct        cfdata *match;
              int        pri;
      };
      
      struct alldevs_foray {
              int                        af_s;
              struct devicelist        af_garbage;
      };
      
      static char *number(char *, int);
      static void mapply(struct matchinfo *, cfdata_t);
      static device_t config_devalloc(const device_t, const cfdata_t, const int *);
      static void config_devdelete(device_t);
      static void config_devunlink(device_t, struct devicelist *);
      static void config_makeroom(int, struct cfdriver *);
      static void config_devlink(device_t);
      static void config_alldevs_enter(struct alldevs_foray *);
      static void config_alldevs_exit(struct alldevs_foray *);
      static void config_add_attrib_dict(device_t);
      
      static void config_collect_garbage(struct devicelist *);
      static void config_dump_garbage(struct devicelist *);
      
      static void pmflock_debug(device_t, const char *, int);
      
      static device_t deviter_next1(deviter_t *);
      static void deviter_reinit(deviter_t *);
      
      struct deferred_config {
              TAILQ_ENTRY(deferred_config) dc_queue;
              device_t dc_dev;
              void (*dc_func)(device_t);
      };
      
      TAILQ_HEAD(deferred_config_head, deferred_config);
      
      static struct deferred_config_head deferred_config_queue =
              TAILQ_HEAD_INITIALIZER(deferred_config_queue);
      static struct deferred_config_head interrupt_config_queue =
              TAILQ_HEAD_INITIALIZER(interrupt_config_queue);
      static int interrupt_config_threads = 8;
      static struct deferred_config_head mountroot_config_queue =
              TAILQ_HEAD_INITIALIZER(mountroot_config_queue);
      static int mountroot_config_threads = 2;
      static lwp_t **mountroot_config_lwpids;
      static size_t mountroot_config_lwpids_size;
      bool root_is_mounted = false;
      
      static void config_process_deferred(struct deferred_config_head *, device_t);
      
      /* Hooks to finalize configuration once all real devices have been found. */
      struct finalize_hook {
              TAILQ_ENTRY(finalize_hook) f_list;
              int (*f_func)(device_t);
              device_t f_dev;
      };
      static TAILQ_HEAD(, finalize_hook) config_finalize_list =
              TAILQ_HEAD_INITIALIZER(config_finalize_list);
      static int config_finalize_done;
      
      /* list of all devices */
      static struct devicelist alldevs = TAILQ_HEAD_INITIALIZER(alldevs);
      static kmutex_t alldevs_lock __cacheline_aligned;
      static devgen_t alldevs_gen = 1;
      static int alldevs_nread = 0;
      static int alldevs_nwrite = 0;
      static bool alldevs_garbage = false;
      
      static int config_pending;                /* semaphore for mountroot */
      static kmutex_t config_misc_lock;
      static kcondvar_t config_misc_cv;
      
      static bool detachall = false;
      
      #define        STREQ(s1, s2)                        \
              (*(s1) == *(s2) && strcmp((s1), (s2)) == 0)
      
      static bool config_initialized = false;        /* config_init() has been called. */
      
      static int config_do_twiddle;
      static callout_t config_twiddle_ch;
      
      static void sysctl_detach_setup(struct sysctllog **);
      
      int no_devmon_insert(const char *, prop_dictionary_t);
      int (*devmon_insert_vec)(const char *, prop_dictionary_t) = no_devmon_insert;
      
      typedef int (*cfdriver_fn)(struct cfdriver *);
      static int
      frob_cfdrivervec(struct cfdriver * const *cfdriverv,
              cfdriver_fn drv_do, cfdriver_fn drv_undo,
              const char *style, bool dopanic)
      {
              void (*pr)(const char *, ...) __printflike(1, 2) =
                  dopanic ? panic : printf;
              int i, error = 0, e2 __diagused;
      
              for (i = 0; cfdriverv[i] != NULL; i++) {
                      if ((error = drv_do(cfdriverv[i])) != 0) {
                              pr("configure: `%s' driver %s failed: %d",
                                  cfdriverv[i]->cd_name, style, error);
                              goto bad;
                      }
              }
      
              KASSERT(error == 0);
              return 0;
      
       bad:
              printf("\n");
              for (i--; i >= 0; i--) {
                      e2 = drv_undo(cfdriverv[i]);
                      KASSERT(e2 == 0);
              }
      
              return error;
      }
      
      typedef int (*cfattach_fn)(const char *, struct cfattach *);
      static int
      frob_cfattachvec(const struct cfattachinit *cfattachv,
              cfattach_fn att_do, cfattach_fn att_undo,
              const char *style, bool dopanic)
      {
              const struct cfattachinit *cfai = NULL;
              void (*pr)(const char *, ...) __printflike(1, 2) =
                  dopanic ? panic : printf;
              int j = 0, error = 0, e2 __diagused;
      
              for (cfai = &cfattachv[0]; cfai->cfai_name != NULL; cfai++) {
                      for (j = 0; cfai->cfai_list[j] != NULL; j++) {
                              if ((error = att_do(cfai->cfai_name,
                                  cfai->cfai_list[j])) != 0) {
                                      pr("configure: attachment `%s' "
                                          "of `%s' driver %s failed: %d",
                                          cfai->cfai_list[j]->ca_name,
                                          cfai->cfai_name, style, error);
                                      goto bad;
                              }
                      }
              }
      
              KASSERT(error == 0);
              return 0;
      
       bad:
	/*
	 * Roll back in reverse order.  It is not clear that the order is
	 * strictly required, but do it anyway, mirroring the forward
	 * pass above.
	 */
              printf("\n");
              if (cfai) {
                      bool last;
      
                      for (last = false; last == false; ) {
                              if (cfai == &cfattachv[0])
                                      last = true;
                              for (j--; j >= 0; j--) {
                                      e2 = att_undo(cfai->cfai_name,
                                          cfai->cfai_list[j]);
                                      KASSERT(e2 == 0);
                              }
                              if (!last) {
                                      cfai--;
                                      for (j = 0; cfai->cfai_list[j] != NULL; j++)
                                              ;
                              }
                      }
              }
      
              return error;
      }
      
      /*
       * Initialize the autoconfiguration data structures.  Normally this
       * is done by configure(), but some platforms need to do this very
       * early (to e.g. initialize the console).
       */
      void
      config_init(void)
      {
      
              KASSERT(config_initialized == false);
      
              mutex_init(&alldevs_lock, MUTEX_DEFAULT, IPL_VM);
      
              mutex_init(&config_misc_lock, MUTEX_DEFAULT, IPL_NONE);
              cv_init(&config_misc_cv, "cfgmisc");
      
              callout_init(&config_twiddle_ch, CALLOUT_MPSAFE);
      
              frob_cfdrivervec(cfdriver_list_initial,
                  config_cfdriver_attach, NULL, "bootstrap", true);
              frob_cfattachvec(cfattachinit,
                  config_cfattach_attach, NULL, "bootstrap", true);
      
              initcftable.ct_cfdata = cfdata;
              TAILQ_INSERT_TAIL(&allcftables, &initcftable, ct_list);
      
              config_initialized = true;
      }
      
      /*
       * Init or fini drivers and attachments.  Either all or none
       * are processed (via rollback).  It would be nice if this were
       * atomic to outside consumers, but with the current state of
       * locking ...
       */
      int
      config_init_component(struct cfdriver * const *cfdriverv,
              const struct cfattachinit *cfattachv, struct cfdata *cfdatav)
      {
              int error;
      
              if ((error = frob_cfdrivervec(cfdriverv,
                  config_cfdriver_attach, config_cfdriver_detach, "init", false))!= 0)
                      return error;
              if ((error = frob_cfattachvec(cfattachv,
                  config_cfattach_attach, config_cfattach_detach,
                  "init", false)) != 0) {
                      frob_cfdrivervec(cfdriverv,
                          config_cfdriver_detach, NULL, "init rollback", true);
                      return error;
              }
              if ((error = config_cfdata_attach(cfdatav, 1)) != 0) {
                      frob_cfattachvec(cfattachv,
                          config_cfattach_detach, NULL, "init rollback", true);
                      frob_cfdrivervec(cfdriverv,
                          config_cfdriver_detach, NULL, "init rollback", true);
                      return error;
              }
      
              return 0;
      }
      
      int
      config_fini_component(struct cfdriver * const *cfdriverv,
              const struct cfattachinit *cfattachv, struct cfdata *cfdatav)
      {
              int error;
      
              if ((error = config_cfdata_detach(cfdatav)) != 0)
                      return error;
              if ((error = frob_cfattachvec(cfattachv,
                  config_cfattach_detach, config_cfattach_attach,
                  "fini", false)) != 0) {
                      if (config_cfdata_attach(cfdatav, 0) != 0)
                              panic("config_cfdata fini rollback failed");
                      return error;
              }
              if ((error = frob_cfdrivervec(cfdriverv,
                  config_cfdriver_detach, config_cfdriver_attach,
                  "fini", false)) != 0) {
                      frob_cfattachvec(cfattachv,
                          config_cfattach_attach, NULL, "fini rollback", true);
                      if (config_cfdata_attach(cfdatav, 0) != 0)
                              panic("config_cfdata fini rollback failed");
                      return error;
              }
      
              return 0;
      }
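
/*
 * Example (illustrative sketch, not part of this file): a loadable driver
 * module normally registers and unregisters its config(1)-generated tables
 * from its module command hook using the two functions above.  The module
 * name "examplemod" is hypothetical; the cfdriver_ioconf_*, cfattach_ioconf_*
 * and cfdata_ioconf_* symbols are the names config(1) emits for a module's
 * ioconf file.
 */
#if 0
#include <sys/module.h>

extern struct cfdriver * const cfdriver_ioconf_examplemod[];
extern const struct cfattachinit cfattach_ioconf_examplemod[];
extern struct cfdata cfdata_ioconf_examplemod[];

MODULE(MODULE_CLASS_DRIVER, examplemod, NULL);

static int
examplemod_modcmd(modcmd_t cmd, void *opaque)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		/* Attach drivers, attachments and cfdata; rolls back on error. */
		return config_init_component(cfdriver_ioconf_examplemod,
		    cfattach_ioconf_examplemod, cfdata_ioconf_examplemod);
	case MODULE_CMD_FINI:
		/* Reverse of the above; fails if any instance is still attached. */
		return config_fini_component(cfdriver_ioconf_examplemod,
		    cfattach_ioconf_examplemod, cfdata_ioconf_examplemod);
	default:
		return ENOTTY;
	}
}
#endif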
      
      void
      config_init_mi(void)
      {
      
              if (!config_initialized)
                      config_init();
      
              sysctl_detach_setup(NULL);
      }
      
      void
      config_deferred(device_t dev)
      {
              config_process_deferred(&deferred_config_queue, dev);
              config_process_deferred(&interrupt_config_queue, dev);
              config_process_deferred(&mountroot_config_queue, dev);
      }
      
      static void
      config_interrupts_thread(void *cookie)
      {
              struct deferred_config *dc;
      
              while ((dc = TAILQ_FIRST(&interrupt_config_queue)) != NULL) {
                      TAILQ_REMOVE(&interrupt_config_queue, dc, dc_queue);
                      (*dc->dc_func)(dc->dc_dev);
                      dc->dc_dev->dv_flags &= ~DVF_ATTACH_INPROGRESS;
                      if (!device_pmf_is_registered(dc->dc_dev))
                              aprint_debug_dev(dc->dc_dev,
                                  "WARNING: power management not supported\n");
                      config_pending_decr(dc->dc_dev);
                      kmem_free(dc, sizeof(*dc));
              }
              kthread_exit(0);
      }
      
      void
      config_create_interruptthreads(void)
      {
              int i;
      
              for (i = 0; i < interrupt_config_threads; i++) {
                      (void)kthread_create(PRI_NONE, 0, NULL,
                          config_interrupts_thread, NULL, NULL, "configintr");
              }
      }
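
/*
 * Example (sketch): a driver whose attachment needs working interrupts can
 * defer part of its work with config_interrupts(9); the callback later runs
 * from one of the "configintr" threads created above.  The "myintr" names
 * are hypothetical.
 */
#if 0
static void
myintr_deferred(device_t self)
{

	/* Interrupts are enabled by the time this runs. */
	aprint_normal_dev(self, "deferred configuration complete\n");
}

static void
myintr_attach(device_t parent, device_t self, void *aux)
{

	aprint_normal(": example device\n");
	config_interrupts(self, myintr_deferred);
}
#endif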
      
      static void
      config_mountroot_thread(void *cookie)
      {
              struct deferred_config *dc;
      
              while ((dc = TAILQ_FIRST(&mountroot_config_queue)) != NULL) {
                      TAILQ_REMOVE(&mountroot_config_queue, dc, dc_queue);
                      (*dc->dc_func)(dc->dc_dev);
                      kmem_free(dc, sizeof(*dc));
              }
              kthread_exit(0);
      }
      
      void
      config_create_mountrootthreads(void)
      {
              int i;
      
              if (!root_is_mounted)
                      root_is_mounted = true;
      
              mountroot_config_lwpids_size = sizeof(mountroot_config_lwpids) *
                                             mountroot_config_threads;
              mountroot_config_lwpids = kmem_alloc(mountroot_config_lwpids_size,
                                                   KM_NOSLEEP);
              KASSERT(mountroot_config_lwpids);
              for (i = 0; i < mountroot_config_threads; i++) {
                      mountroot_config_lwpids[i] = 0;
                      (void)kthread_create(PRI_NONE, KTHREAD_MUSTJOIN, NULL,
                                           config_mountroot_thread, NULL,
                                           &mountroot_config_lwpids[i],
                                           "configroot");
              }
      }
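
/*
 * Example (sketch): drivers that must wait until the root file system is
 * mounted (e.g. to read firmware from disk) queue a callback with
 * config_mountroot(9); it runs from a "configroot" thread created above.
 * The "mydisk" names are hypothetical.
 */
#if 0
static void
mydisk_mountroot_cb(device_t self)
{

	/* The root file system is accessible here. */
	aprint_normal_dev(self, "running post-mountroot configuration\n");
}

static void
mydisk_attach(device_t parent, device_t self, void *aux)
{

	aprint_normal(": example device\n");
	config_mountroot(self, mydisk_mountroot_cb);
}
#endif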
      
      void
      config_finalize_mountroot(void)
      {
              int i, error;
      
              for (i = 0; i < mountroot_config_threads; i++) {
                      if (mountroot_config_lwpids[i] == 0)
                              continue;
      
                      error = kthread_join(mountroot_config_lwpids[i]);
                      if (error)
                              printf("%s: thread %x joined with error %d\n",
                                     __func__, i, error);
              }
              kmem_free(mountroot_config_lwpids, mountroot_config_lwpids_size);
      }
      
      /*
       * Announce device attach/detach to userland listeners.
       */
      
      int
      no_devmon_insert(const char *name, prop_dictionary_t p)
      {
      
              return ENODEV;
      }
      
      static void
      devmon_report_device(device_t dev, bool isattach)
      {
              prop_dictionary_t ev;
              const char *parent;
              const char *what;
              device_t pdev = device_parent(dev);
      
	/* If there is currently no drvctl device, just return. */
              if (devmon_insert_vec == no_devmon_insert)
                      return;
      
              ev = prop_dictionary_create();
              if (ev == NULL)
                      return;
      
              what = (isattach ? "device-attach" : "device-detach");
              parent = (pdev == NULL ? "root" : device_xname(pdev));
              if (!prop_dictionary_set_cstring(ev, "device", device_xname(dev)) ||
                  !prop_dictionary_set_cstring(ev, "parent", parent)) {
                      prop_object_release(ev);
                      return;
              }
      
              if ((*devmon_insert_vec)(what, ev) != 0)
                      prop_object_release(ev);
      }
      
      /*
       * Add a cfdriver to the system.
       */
      int
      config_cfdriver_attach(struct cfdriver *cd)
      {
              struct cfdriver *lcd;
      
              /* Make sure this driver isn't already in the system. */
              LIST_FOREACH(lcd, &allcfdrivers, cd_list) {
                      if (STREQ(lcd->cd_name, cd->cd_name))
                              return EEXIST;
              }
      
              LIST_INIT(&cd->cd_attach);
              LIST_INSERT_HEAD(&allcfdrivers, cd, cd_list);
      
              return 0;
      }
      
      /*
       * Remove a cfdriver from the system.
       */
      int
      config_cfdriver_detach(struct cfdriver *cd)
      {
              struct alldevs_foray af;
              int i, rc = 0;
      
              config_alldevs_enter(&af);
              /* Make sure there are no active instances. */
              for (i = 0; i < cd->cd_ndevs; i++) {
                      if (cd->cd_devs[i] != NULL) {
                              rc = EBUSY;
                              break;
                      }
              }
              config_alldevs_exit(&af);
      
              if (rc != 0)
                      return rc;
      
              /* ...and no attachments loaded. */
              if (LIST_EMPTY(&cd->cd_attach) == 0)
                      return EBUSY;
      
              LIST_REMOVE(cd, cd_list);
      
              KASSERT(cd->cd_devs == NULL);
      
              return 0;
      }
      
      /*
       * Look up a cfdriver by name.
       */
      struct cfdriver *
      config_cfdriver_lookup(const char *name)
      {
              struct cfdriver *cd;
      
              LIST_FOREACH(cd, &allcfdrivers, cd_list) {
                      if (STREQ(cd->cd_name, name))
                              return cd;
              }
      
              return NULL;
      }
      
      /*
       * Add a cfattach to the specified driver.
       */
      int
      config_cfattach_attach(const char *driver, struct cfattach *ca)
      {
              struct cfattach *lca;
              struct cfdriver *cd;
      
              cd = config_cfdriver_lookup(driver);
              if (cd == NULL)
                      return ESRCH;
      
              /* Make sure this attachment isn't already on this driver. */
              LIST_FOREACH(lca, &cd->cd_attach, ca_list) {
                      if (STREQ(lca->ca_name, ca->ca_name))
                              return EEXIST;
              }
      
              LIST_INSERT_HEAD(&cd->cd_attach, ca, ca_list);
      
              return 0;
      }
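
/*
 * Example (illustrative sketch): declaring a driver/attachment pair and
 * registering it by hand with the two functions above.  A statically
 * configured kernel has this done for it from config_init() using the
 * config(1)-generated tables, and modules use config_init_component();
 * the hand-rolled "mydev" driver below is hypothetical and only shows the
 * call sequence and the rollback on failure.
 */
#if 0
static int
mydev_match(device_t parent, cfdata_t cf, void *aux)
{

	return 1;	/* claim anything offered on this attachment */
}

static void
mydev_attach(device_t parent, device_t self, void *aux)
{

	aprint_normal(": example device\n");
}

CFDRIVER_DECL(mydev, DV_DULL, NULL);
CFATTACH_DECL_NEW(mydev, 0 /* no softc in this sketch */,
    mydev_match, mydev_attach, NULL, NULL);

static int
mydev_register(void)
{
	int error;

	if ((error = config_cfdriver_attach(&mydev_cd)) != 0)
		return error;
	if ((error = config_cfattach_attach("mydev", &mydev_ca)) != 0) {
		(void)config_cfdriver_detach(&mydev_cd);
		return error;
	}
	return 0;
}
#endif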
      
      /*
       * Remove a cfattach from the specified driver.
       */
      int
      config_cfattach_detach(const char *driver, struct cfattach *ca)
      {
              struct alldevs_foray af;
              struct cfdriver *cd;
              device_t dev;
              int i, rc = 0;
      
              cd = config_cfdriver_lookup(driver);
              if (cd == NULL)
                      return ESRCH;
      
              config_alldevs_enter(&af);
              /* Make sure there are no active instances. */
              for (i = 0; i < cd->cd_ndevs; i++) {
                      if ((dev = cd->cd_devs[i]) == NULL)
                              continue;
                      if (dev->dv_cfattach == ca) {
                              rc = EBUSY;
                              break;
                      }
              }
              config_alldevs_exit(&af);
      
              if (rc != 0)
                      return rc;
      
              LIST_REMOVE(ca, ca_list);
      
              return 0;
      }
      
      /*
       * Look up a cfattach by name.
       */
      static struct cfattach *
      config_cfattach_lookup_cd(struct cfdriver *cd, const char *atname)
      {
              struct cfattach *ca;
      
              LIST_FOREACH(ca, &cd->cd_attach, ca_list) {
                      if (STREQ(ca->ca_name, atname))
                              return ca;
              }
      
              return NULL;
      }
      
      /*
       * Look up a cfattach by driver/attachment name.
       */
      struct cfattach *
      config_cfattach_lookup(const char *name, const char *atname)
      {
              struct cfdriver *cd;
      
              cd = config_cfdriver_lookup(name);
              if (cd == NULL)
                      return NULL;
      
              return config_cfattach_lookup_cd(cd, atname);
      }
      
      /*
       * Apply the matching function and choose the best.  This is used
       * a few times and we want to keep the code small.
       */
      static void
      mapply(struct matchinfo *m, cfdata_t cf)
      {
              int pri;
      
              if (m->fn != NULL) {
                      pri = (*m->fn)(m->parent, cf, m->locs, m->aux);
              } else {
                      pri = config_match(m->parent, cf, m->aux);
              }
              if (pri > m->pri) {
                      m->match = cf;
                      m->pri = pri;
              }
      }
      
      int
      config_stdsubmatch(device_t parent, cfdata_t cf, const int *locs, void *aux)
      {
              const struct cfiattrdata *ci;
              const struct cflocdesc *cl;
              int nlocs, i;
      
              ci = cfiattr_lookup(cfdata_ifattr(cf), parent->dv_cfdriver);
              KASSERT(ci);
              nlocs = ci->ci_loclen;
              KASSERT(!nlocs || locs);
              for (i = 0; i < nlocs; i++) {
                      cl = &ci->ci_locdesc[i];
                      if (cl->cld_defaultstr != NULL &&
                          cf->cf_loc[i] == cl->cld_default)
                              continue;
                      if (cf->cf_loc[i] == locs[i])
                              continue;
                      return 0;
              }
      
              return config_match(parent, cf, aux);
      }
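
/*
 * Example (sketch): a bus that knows a child's locator values can pass
 * config_stdsubmatch() as the submatch function so that cfdata entries
 * with non-matching, non-default locators are filtered out.  The "mybus"
 * interface attribute, its single locator and the attach arguments are
 * hypothetical; real buses take their locator indices from the
 * config(1)-generated locator headers.
 */
#if 0
static void
mybus_attach_at(device_t self, int slot, void *aux)
{
	int locs[1];

	locs[0] = slot;		/* hypothetical "slot" locator */
	(void)config_found_sm_loc(self, "mybus", locs, aux,
	    NULL, config_stdsubmatch);
}
#endif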
      
      /*
       * Helper function: check whether the driver supports the interface attribute
       * and return its descriptor structure.
       */
      static const struct cfiattrdata *
      cfdriver_get_iattr(const struct cfdriver *cd, const char *ia)
      {
              const struct cfiattrdata * const *cpp;
      
              if (cd->cd_attrs == NULL)
                      return 0;
      
              for (cpp = cd->cd_attrs; *cpp; cpp++) {
                      if (STREQ((*cpp)->ci_name, ia)) {
                              /* Match. */
                              return *cpp;
                      }
              }
              return 0;
      }
      
      /*
       * Lookup an interface attribute description by name.
       * If the driver is given, consider only its supported attributes.
       */
      const struct cfiattrdata *
      cfiattr_lookup(const char *name, const struct cfdriver *cd)
      {
              const struct cfdriver *d;
              const struct cfiattrdata *ia;
      
              if (cd)
                      return cfdriver_get_iattr(cd, name);
      
              LIST_FOREACH(d, &allcfdrivers, cd_list) {
                      ia = cfdriver_get_iattr(d, name);
                      if (ia)
                              return ia;
              }
              return 0;
      }
      
      /*
       * Determine if `parent' is a potential parent for a device spec based
       * on `cfp'.
       */
      static int
      cfparent_match(const device_t parent, const struct cfparent *cfp)
      {
              struct cfdriver *pcd;
      
              /* We don't match root nodes here. */
              if (cfp == NULL)
                      return 0;
      
              pcd = parent->dv_cfdriver;
              KASSERT(pcd != NULL);
      
              /*
               * First, ensure this parent has the correct interface
               * attribute.
               */
              if (!cfdriver_get_iattr(pcd, cfp->cfp_iattr))
                      return 0;
      
              /*
               * If no specific parent device instance was specified (i.e.
               * we're attaching to the attribute only), we're done!
               */
              if (cfp->cfp_parent == NULL)
                      return 1;
      
              /*
               * Check the parent device's name.
               */
              if (STREQ(pcd->cd_name, cfp->cfp_parent) == 0)
                      return 0;        /* not the same parent */
      
              /*
               * Make sure the unit number matches.
               */
              if (cfp->cfp_unit == DVUNIT_ANY ||        /* wildcard */
                  cfp->cfp_unit == parent->dv_unit)
                      return 1;
      
              /* Unit numbers don't match. */
              return 0;
      }
      
/*
 * Helper for config_cfdata_attach(): check every device to see whether
 * it could be the parent of an attachment in the config data table
 * passed in, and rescan those that could.
 */
      static void
      rescan_with_cfdata(const struct cfdata *cf)
      {
              device_t d;
              const struct cfdata *cf1;
              deviter_t di;
      
      
	/*
	 * "alldevs" is likely longer than a module's cfdata, so make it
	 * the outer loop.
	 */
              for (d = deviter_first(&di, 0); d != NULL; d = deviter_next(&di)) {
      
                      if (!(d->dv_cfattach->ca_rescan))
                              continue;
      
                      for (cf1 = cf; cf1->cf_name; cf1++) {
      
                              if (!cfparent_match(d, cf1->cf_pspec))
                                      continue;
      
                              (*d->dv_cfattach->ca_rescan)(d,
                                      cfdata_ifattr(cf1), cf1->cf_loc);
      
                              config_deferred(d);
                      }
              }
              deviter_release(&di);
      }
      
      /*
       * Attach a supplemental config data table and rescan potential
       * parent devices if required.
       */
      int
      config_cfdata_attach(cfdata_t cf, int scannow)
      {
              struct cftable *ct;
      
              ct = kmem_alloc(sizeof(*ct), KM_SLEEP);
              ct->ct_cfdata = cf;
              TAILQ_INSERT_TAIL(&allcftables, ct, ct_list);
      
              if (scannow)
                      rescan_with_cfdata(cf);
      
              return 0;
      }
      
      /*
       * Helper for config_cfdata_detach: check whether a device is
       * found through any attachment in the config data table.
       */
      static int
      dev_in_cfdata(device_t d, cfdata_t cf)
      {
              const struct cfdata *cf1;
      
              for (cf1 = cf; cf1->cf_name; cf1++)
                      if (d->dv_cfdata == cf1)
                              return 1;
      
              return 0;
      }
      
/*
 * Detach a supplemental config data table.  All devices found through
 * that table (and therefore holding references to it) are detached first.
 */
      int
      config_cfdata_detach(cfdata_t cf)
      {
              device_t d;
              int error = 0;
              struct cftable *ct;
              deviter_t di;
      
              for (d = deviter_first(&di, DEVITER_F_RW); d != NULL;
                   d = deviter_next(&di)) {
                      if (!dev_in_cfdata(d, cf))
                              continue;
                      if ((error = config_detach(d, 0)) != 0)
                              break;
              }
              deviter_release(&di);
              if (error) {
                      aprint_error_dev(d, "unable to detach instance\n");
                      return error;
              }
      
              TAILQ_FOREACH(ct, &allcftables, ct_list) {
                      if (ct->ct_cfdata == cf) {
                              TAILQ_REMOVE(&allcftables, ct, ct_list);
                              kmem_free(ct, sizeof(*ct));
                              return 0;
                      }
              }
      
              /* not found -- shouldn't happen */
              return EINVAL;
      }
      
      /*
       * Invoke the "match" routine for a cfdata entry on behalf of
       * an external caller, usually a "submatch" routine.
       */
      int
      config_match(device_t parent, cfdata_t cf, void *aux)
      {
              struct cfattach *ca;
      
              ca = config_cfattach_lookup(cf->cf_name, cf->cf_atname);
              if (ca == NULL) {
                      /* No attachment for this entry, oh well. */
                      return 0;
              }
      
              return (*ca->ca_match)(parent, cf, aux);
      }
      
      /*
       * Iterate over all potential children of some device, calling the given
       * function (default being the child's match function) for each one.
       * Nonzero returns are matches; the highest value returned is considered
       * the best match.  Return the `found child' if we got a match, or NULL
       * otherwise.  The `aux' pointer is simply passed on through.
       *
       * Note that this function is designed so that it can be used to apply
       * an arbitrary function to all potential children (its return value
       * can be ignored).
       */
      cfdata_t
      config_search_loc(cfsubmatch_t fn, device_t parent,
                        const char *ifattr, const int *locs, void *aux)
      {
              struct cftable *ct;
              cfdata_t cf;
              struct matchinfo m;
      
              KASSERT(config_initialized);
              KASSERT(!ifattr || cfdriver_get_iattr(parent->dv_cfdriver, ifattr));
      
              m.fn = fn;
              m.parent = parent;
              m.locs = locs;
              m.aux = aux;
              m.match = NULL;
              m.pri = 0;
      
              TAILQ_FOREACH(ct, &allcftables, ct_list) {
                      for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
      
                              /* We don't match root nodes here. */
                              if (!cf->cf_pspec)
                                      continue;
      
                              /*
                               * Skip cf if no longer eligible, otherwise scan
                               * through parents for one matching `parent', and
                               * try match function.
                               */
                              if (cf->cf_fstate == FSTATE_FOUND)
                                      continue;
                              if (cf->cf_fstate == FSTATE_DNOTFOUND ||
                                  cf->cf_fstate == FSTATE_DSTAR)
                                      continue;
      
                              /*
                               * If an interface attribute was specified,
                               * consider only children which attach to
                               * that attribute.
                               */
                              if (ifattr && !STREQ(ifattr, cfdata_ifattr(cf)))
                                      continue;
      
                              if (cfparent_match(parent, cf->cf_pspec))
                                      mapply(&m, cf);
                      }
              }
              return m.match;
      }
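
/*
 * Example (sketch): the classic indirect-configuration idiom.  The parent
 * probes every candidate cfdata entry itself: config_search_ia() applies
 * the callback to all potential children, the callback consults
 * config_match() and attaches each hit, and its return value is ignored.
 * The "mybus" names, attach arguments and locator use are hypothetical.
 */
#if 0
struct mybus_attach_args {
	int	ma_slot;	/* hypothetical locator-derived value */
};

static int
mybus_search(device_t parent, cfdata_t cf, const int *locs, void *aux)
{
	struct mybus_attach_args ma;

	ma.ma_slot = cf->cf_loc[0];	/* slot requested by the config file */
	if (config_match(parent, cf, &ma) > 0)
		(void)config_attach_loc(parent, cf, locs, &ma, NULL);
	return 0;			/* never "wins"; we attach ourselves */
}

static void
mybus_attach_children(device_t self)
{

	/* Consider every cfdata entry that could attach at "mybus". */
	(void)config_search_ia(mybus_search, self, "mybus", NULL);
}
#endif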
      
      cfdata_t
      config_search_ia(cfsubmatch_t fn, device_t parent, const char *ifattr,
          void *aux)
      {
      
              return config_search_loc(fn, parent, ifattr, NULL, aux);
      }
      
      /*
       * Find the given root device.
       * This is much like config_search, but there is no parent.
       * Don't bother with multiple cfdata tables; the root node
       * must always be in the initial table.
       */
      cfdata_t
      config_rootsearch(cfsubmatch_t fn, const char *rootname, void *aux)
      {
              cfdata_t cf;
              const short *p;
              struct matchinfo m;
      
              m.fn = fn;
              m.parent = ROOT;
              m.aux = aux;
              m.match = NULL;
              m.pri = 0;
              m.locs = 0;
              /*
               * Look at root entries for matching name.  We do not bother
               * with found-state here since only one root should ever be
               * searched (and it must be done first).
               */
              for (p = cfroots; *p >= 0; p++) {
                      cf = &cfdata[*p];
                      if (strcmp(cf->cf_name, rootname) == 0)
                              mapply(&m, cf);
              }
              return m.match;
      }
      
      static const char * const msgs[3] = { "", " not configured\n", " unsupported\n" };
      
      /*
       * The given `aux' argument describes a device that has been found
       * on the given parent, but not necessarily configured.  Locate the
       * configuration data for that device (using the submatch function
       * provided, or using candidates' cd_match configuration driver
       * functions) and attach it, and return its device_t.  If the device was
       * not configured, call the given `print' function and return NULL.
       */
      device_t
      config_found_sm_loc(device_t parent,
                      const char *ifattr, const int *locs, void *aux,
                      cfprint_t print, cfsubmatch_t submatch)
      {
              cfdata_t cf;
      
              if ((cf = config_search_loc(submatch, parent, ifattr, locs, aux)))
                      return(config_attach_loc(parent, cf, locs, aux, print));
              if (print) {
                      if (config_do_twiddle && cold)
                              twiddle();
                      aprint_normal("%s", msgs[(*print)(aux, device_xname(parent))]);
              }
      
              /*
	 * This has the effect of mixing a single timestamp into the
	 * entropy pool.  Experiments indicate the estimator will almost
               * always attribute one bit of entropy to this sample; analysis
               * of device attach/detach timestamps on FreeBSD indicates 4
               * bits of entropy/sample so this seems appropriately conservative.
               */
              rnd_add_uint32(&rnd_autoconf_source, 0);
              return NULL;
      }
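
/*
 * Example (sketch): the direct-configuration idiom.  The parent describes
 * what it found in an attach-arguments structure and calls config_found();
 * the print callback is used when nothing matches (msgs[] above supplies
 * the " not configured" suffix for UNCONF).  The "xbus"/"mydev" names and
 * the attach arguments are hypothetical.
 */
#if 0
struct xbus_attach_args {
	const char	*xa_name;	/* hypothetical child name */
	int		 xa_port;	/* hypothetical locator-like detail */
};

static int
xbus_print(void *aux, const char *pnp)
{
	const struct xbus_attach_args *xa = aux;

	if (pnp != NULL)		/* no driver matched */
		aprint_normal("%s at %s", xa->xa_name, pnp);
	aprint_normal(" port %d", xa->xa_port);
	return UNCONF;			/* or QUIET to suppress the message */
}

static void
xbus_found_child(device_t self, int port)
{
	struct xbus_attach_args xa = { .xa_name = "mydev", .xa_port = port };

	(void)config_found(self, &xa, xbus_print);
}
#endif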
      
      device_t
      config_found_ia(device_t parent, const char *ifattr, void *aux,
          cfprint_t print)
      {
      
              return config_found_sm_loc(parent, ifattr, NULL, aux, print, NULL);
      }
      
      device_t
      config_found(device_t parent, void *aux, cfprint_t print)
      {
      
              return config_found_sm_loc(parent, NULL, NULL, aux, print, NULL);
      }
      
      /*
       * As above, but for root devices.
       */
      device_t
      config_rootfound(const char *rootname, void *aux)
      {
              cfdata_t cf;
      
              if ((cf = config_rootsearch(NULL, rootname, aux)) != NULL)
                      return config_attach(ROOT, cf, aux, NULL);
              aprint_error("root device %s not configured\n", rootname);
              return NULL;
      }
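
/*
 * Example (sketch): machine-dependent startup code typically attaches the
 * root of the device tree by name.  A minimal cpu_configure() might look
 * like this; the details (and the "mainbus" name) vary per port.
 */
#if 0
void
cpu_configure(void)
{

	if (config_rootfound("mainbus", NULL) == NULL)
		panic("configure: mainbus not configured");

	/* Machine-dependent interrupt setup would follow here. */
	spl0();
}
#endif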
      
      /* just like sprintf(buf, "%d") except that it works from the end */
      static char *
      number(char *ep, int n)
      {
      
              *--ep = 0;
              while (n >= 10) {
                      *--ep = (n % 10) + '0';
                      n /= 10;
              }
              *--ep = n + '0';
              return ep;
      }
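
/*
 * Example: callers reserve a small buffer and pass a pointer just past its
 * end; the returned pointer is the start of the NUL-terminated string, as
 * config_devalloc() does below.  A minimal sketch:
 */
#if 0
static const char *
example_number_usage(void)
{
	static char num[10];

	return number(&num[sizeof(num)], 42);	/* returns "42" */
}
#endif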
      
      /*
       * Expand the size of the cd_devs array if necessary.
       *
       * The caller must hold alldevs_lock. config_makeroom() may release and
       * re-acquire alldevs_lock, so callers should re-check conditions such
       * as alldevs_nwrite == 0 and alldevs_nread == 0 when config_makeroom()
       * returns.
       */
      static void
      config_makeroom(int n, struct cfdriver *cd)
      {
              int ondevs, nndevs;
              device_t *osp, *nsp;
      
              KASSERT(mutex_owned(&alldevs_lock));
              alldevs_nwrite++;
      
              for (nndevs = MAX(4, cd->cd_ndevs); nndevs <= n; nndevs += nndevs)
                      ;
      
              while (n >= cd->cd_ndevs) {
                      /*
                       * Need to expand the array.
                       */
                      ondevs = cd->cd_ndevs;
                      osp = cd->cd_devs;
      
                      /*
                       * Release alldevs_lock around allocation, which may
                       * sleep.
                       */
                      mutex_exit(&alldevs_lock);
                      nsp = kmem_alloc(sizeof(device_t[nndevs]), KM_SLEEP);
                      mutex_enter(&alldevs_lock);
      
                      /*
                       * If another thread moved the array while we did
                       * not hold alldevs_lock, try again.
                       */
                      if (cd->cd_devs != osp) {
                              mutex_exit(&alldevs_lock);
                              kmem_free(nsp, sizeof(device_t[nndevs]));
                              mutex_enter(&alldevs_lock);
                              continue;
                      }
      
                      memset(nsp + ondevs, 0, sizeof(device_t[nndevs - ondevs]));
                      if (ondevs != 0)
                              memcpy(nsp, cd->cd_devs, sizeof(device_t[ondevs]));
      
                      cd->cd_ndevs = nndevs;
                      cd->cd_devs = nsp;
                      if (ondevs != 0) {
                              mutex_exit(&alldevs_lock);
                              kmem_free(osp, sizeof(device_t[ondevs]));
                              mutex_enter(&alldevs_lock);
                      }
              }
              KASSERT(mutex_owned(&alldevs_lock));
              alldevs_nwrite--;
      }
      
      /*
       * Put dev into the devices list.
       */
      static void
      config_devlink(device_t dev)
      {
      
              mutex_enter(&alldevs_lock);
      
              KASSERT(device_cfdriver(dev)->cd_devs[dev->dv_unit] == dev);
      
              dev->dv_add_gen = alldevs_gen;
	/*
	 * It is safe to add a device to the tail of the list while
	 * readers and writers are in the list.
	 */
              TAILQ_INSERT_TAIL(&alldevs, dev, dv_list);
              mutex_exit(&alldevs_lock);
      }
      
      static void
      config_devfree(device_t dev)
      {
              int priv = (dev->dv_flags & DVF_PRIV_ALLOC);
      
              if (dev->dv_cfattach->ca_devsize > 0)
                      kmem_free(dev->dv_private, dev->dv_cfattach->ca_devsize);
              if (priv)
                      kmem_free(dev, sizeof(*dev));
      }
      
      /*
       * Caller must hold alldevs_lock.
       */
      static void
      config_devunlink(device_t dev, struct devicelist *garbage)
      {
              struct device_garbage *dg = &dev->dv_garbage;
              cfdriver_t cd = device_cfdriver(dev);
              int i;
      
              KASSERT(mutex_owned(&alldevs_lock));
      
	/* Unlink from device list.  Link to garbage list. */
              TAILQ_REMOVE(&alldevs, dev, dv_list);
              TAILQ_INSERT_TAIL(garbage, dev, dv_list);
      
              /* Remove from cfdriver's array. */
              cd->cd_devs[dev->dv_unit] = NULL;
      
	/*
	 * If the driver now has no units in use, unlink its device
	 * (cd_devs) array.
	 */
              for (i = 0; i < cd->cd_ndevs; i++) {
                      if (cd->cd_devs[i] != NULL)
                              break;
              }
              /* Nothing found.  Unlink, now.  Deallocate, later. */
              if (i == cd->cd_ndevs) {
                      dg->dg_ndevs = cd->cd_ndevs;
                      dg->dg_devs = cd->cd_devs;
                      cd->cd_devs = NULL;
                      cd->cd_ndevs = 0;
              }
      }
      
      static void
      config_devdelete(device_t dev)
      {
              struct device_garbage *dg = &dev->dv_garbage;
              device_lock_t dvl = device_getlock(dev);
      
              if (dg->dg_devs != NULL)
                      kmem_free(dg->dg_devs, sizeof(device_t[dg->dg_ndevs]));
      
              cv_destroy(&dvl->dvl_cv);
              mutex_destroy(&dvl->dvl_mtx);
      
              KASSERT(dev->dv_properties != NULL);
              prop_object_release(dev->dv_properties);
      
              if (dev->dv_activity_handlers)
                      panic("%s with registered handlers", __func__);
      
              if (dev->dv_locators) {
                      size_t amount = *--dev->dv_locators;
                      kmem_free(dev->dv_locators, amount);
              }
      
              config_devfree(dev);
      }
      
      static int
      config_unit_nextfree(cfdriver_t cd, cfdata_t cf)
      {
              int unit;
      
              if (cf->cf_fstate == FSTATE_STAR) {
                      for (unit = cf->cf_unit; unit < cd->cd_ndevs; unit++)
                              if (cd->cd_devs[unit] == NULL)
                                      break;
                      /*
                       * unit is now the unit of the first NULL device pointer,
                       * or max(cd->cd_ndevs,cf->cf_unit).
                       */
              } else {
                      unit = cf->cf_unit;
                      if (unit < cd->cd_ndevs && cd->cd_devs[unit] != NULL)
                              unit = -1;
              }
              return unit;
      }
      
      static int
      config_unit_alloc(device_t dev, cfdriver_t cd, cfdata_t cf)
      {
              struct alldevs_foray af;
              int unit;
      
              config_alldevs_enter(&af);
              for (;;) {
                      unit = config_unit_nextfree(cd, cf);
                      if (unit == -1)
                              break;
                      if (unit < cd->cd_ndevs) {
                              cd->cd_devs[unit] = dev;
                              dev->dv_unit = unit;
                              break;
                      }
                      config_makeroom(unit, cd);
              }
              config_alldevs_exit(&af);
      
              return unit;
      }
      
      static device_t
      config_devalloc(const device_t parent, const cfdata_t cf, const int *locs)
      {
              cfdriver_t cd;
              cfattach_t ca;
              size_t lname, lunit;
              const char *xunit;
              int myunit;
              char num[10];
              device_t dev;
              void *dev_private;
              const struct cfiattrdata *ia;
              device_lock_t dvl;
      
              cd = config_cfdriver_lookup(cf->cf_name);
              if (cd == NULL)
                      return NULL;
      
              ca = config_cfattach_lookup_cd(cd, cf->cf_atname);
              if (ca == NULL)
                      return NULL;
      
              /* get memory for all device vars */
              KASSERTMSG((ca->ca_flags & DVF_PRIV_ALLOC)
                  || ca->ca_devsize >= sizeof(struct device),
                  "%s: %s (%zu < %zu)", __func__, cf->cf_atname, ca->ca_devsize,
                  sizeof(struct device));
              if (ca->ca_devsize > 0) {
                      dev_private = kmem_zalloc(ca->ca_devsize, KM_SLEEP);
              } else {
                      KASSERT(ca->ca_flags & DVF_PRIV_ALLOC);
                      dev_private = NULL;
              }
      
              if ((ca->ca_flags & DVF_PRIV_ALLOC) != 0) {
                      dev = kmem_zalloc(sizeof(*dev), KM_SLEEP);
              } else {
                      dev = dev_private;
      #ifdef DIAGNOSTIC
                      printf("%s has not been converted to device_t\n", cd->cd_name);
      #endif
                      KASSERT(dev != NULL);
              }
              dev->dv_class = cd->cd_class;
              dev->dv_cfdata = cf;
              dev->dv_cfdriver = cd;
              dev->dv_cfattach = ca;
              dev->dv_activity_count = 0;
              dev->dv_activity_handlers = NULL;
              dev->dv_private = dev_private;
              dev->dv_flags = ca->ca_flags;        /* inherit flags from class */
      
              myunit = config_unit_alloc(dev, cd, cf);
              if (myunit == -1) {
                      config_devfree(dev);
                      return NULL;
              }
      
              /* compute length of name and decimal expansion of unit number */
              lname = strlen(cd->cd_name);
              xunit = number(&num[sizeof(num)], myunit);
              lunit = &num[sizeof(num)] - xunit;
              if (lname + lunit > sizeof(dev->dv_xname))
                      panic("config_devalloc: device name too long");
      
              dvl = device_getlock(dev);
      
              mutex_init(&dvl->dvl_mtx, MUTEX_DEFAULT, IPL_NONE);
              cv_init(&dvl->dvl_cv, "pmfsusp");
      
              memcpy(dev->dv_xname, cd->cd_name, lname);
              memcpy(dev->dv_xname + lname, xunit, lunit);
              dev->dv_parent = parent;
              if (parent != NULL)
                      dev->dv_depth = parent->dv_depth + 1;
              else
                      dev->dv_depth = 0;
              dev->dv_flags |= DVF_ACTIVE;        /* always initially active */
              if (locs) {
                      KASSERT(parent); /* no locators at root */
                      ia = cfiattr_lookup(cfdata_ifattr(cf), parent->dv_cfdriver);
                      dev->dv_locators =
                          kmem_alloc(sizeof(int [ia->ci_loclen + 1]), KM_SLEEP);
                      *dev->dv_locators++ = sizeof(int [ia->ci_loclen + 1]);
                      memcpy(dev->dv_locators, locs, sizeof(int [ia->ci_loclen]));
              }
              dev->dv_properties = prop_dictionary_create();
              KASSERT(dev->dv_properties != NULL);
      
              prop_dictionary_set_cstring_nocopy(dev->dv_properties,
                  "device-driver", dev->dv_cfdriver->cd_name);
              prop_dictionary_set_uint16(dev->dv_properties,
                  "device-unit", dev->dv_unit);
              if (parent != NULL) {
                      prop_dictionary_set_cstring(dev->dv_properties,
                          "device-parent", device_xname(parent));
              }
      
              if (dev->dv_cfdriver->cd_attrs != NULL)
                      config_add_attrib_dict(dev);
      
              return dev;
      }
      
      /*
       * Create an array of device attach attributes and add it
       * to the device's dv_properties dictionary.
       *
       * <key>interface-attributes</key>
       * <array>
       *    <dict>
       *       <key>attribute-name</key>
       *       <string>foo</string>
       *       <key>locators</key>
       *       <array>
       *          <dict>
       *             <key>loc-name</key>
       *             <string>foo-loc1</string>
       *          </dict>
       *          <dict>
       *             <key>loc-name</key>
       *             <string>foo-loc2</string>
       *             <key>default</key>
       *             <string>foo-loc2-default</string>
       *          </dict>
       *          ...
       *       </array>
       *    </dict>
       *    ...
       * </array>
       */
      
      static void
      config_add_attrib_dict(device_t dev)
      {
              int i, j;
              const struct cfiattrdata *ci;
              prop_dictionary_t attr_dict, loc_dict;
              prop_array_t attr_array, loc_array;
      
              if ((attr_array = prop_array_create()) == NULL)
                      return;
      
              for (i = 0; ; i++) {
                      if ((ci = dev->dv_cfdriver->cd_attrs[i]) == NULL)
                              break;
                      if ((attr_dict = prop_dictionary_create()) == NULL)
                              break;
                      prop_dictionary_set_cstring_nocopy(attr_dict, "attribute-name",
                          ci->ci_name);
      
                      /* Create an array of the locator names and defaults */
      
                      if (ci->ci_loclen != 0 &&
                          (loc_array = prop_array_create()) != NULL) {
                              for (j = 0; j < ci->ci_loclen; j++) {
                                      loc_dict = prop_dictionary_create();
                                      if (loc_dict == NULL)
                                              continue;
                                      prop_dictionary_set_cstring_nocopy(loc_dict,
                                          "loc-name", ci->ci_locdesc[j].cld_name);
                                      if (ci->ci_locdesc[j].cld_defaultstr != NULL)
                                              prop_dictionary_set_cstring_nocopy(
                                                  loc_dict, "default",
                                                  ci->ci_locdesc[j].cld_defaultstr);
                                      prop_array_set(loc_array, j, loc_dict);
                                      prop_object_release(loc_dict);
                              }
                              prop_dictionary_set_and_rel(attr_dict, "locators",
                                  loc_array);
                      }
                      prop_array_add(attr_array, attr_dict);
                      prop_object_release(attr_dict);
              }
              if (i == 0)
                      prop_object_release(attr_array);
              else
                      prop_dictionary_set_and_rel(dev->dv_properties,
                          "interface-attributes", attr_array);
      
              return;
      }
      
      /*
       * Attach a found device.
       */
      device_t
      config_attach_loc(device_t parent, cfdata_t cf,
              const int *locs, void *aux, cfprint_t print)
      {
              device_t dev;
              struct cftable *ct;
              const char *drvname;
      
              dev = config_devalloc(parent, cf, locs);
              if (!dev)
                      panic("config_attach: allocation of device softc failed");
      
              /* XXX redundant - see below? */
              if (cf->cf_fstate != FSTATE_STAR) {
                      KASSERT(cf->cf_fstate == FSTATE_NOTFOUND);
                      cf->cf_fstate = FSTATE_FOUND;
              }
      
              config_devlink(dev);
      
              if (config_do_twiddle && cold)
                      twiddle();
              else
                      aprint_naive("Found ");
              /*
               * We want the next two printfs for normal, verbose, and quiet,
               * but not silent (in which case, we're twiddling, instead).
               */
              if (parent == ROOT) {
                      aprint_naive("%s (root)", device_xname(dev));
                      aprint_normal("%s (root)", device_xname(dev));
              } else {
                      aprint_naive("%s at %s", device_xname(dev),
                          device_xname(parent));
                      aprint_normal("%s at %s", device_xname(dev),
                          device_xname(parent));
                      if (print)
                              (void) (*print)(aux, NULL);
              }
      
              /*
               * Before attaching, clobber any unfound devices that are
               * otherwise identical.
               * XXX code above is redundant?
               */
              drvname = dev->dv_cfdriver->cd_name;
              TAILQ_FOREACH(ct, &allcftables, ct_list) {
                      for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
                              if (STREQ(cf->cf_name, drvname) &&
                                  cf->cf_unit == dev->dv_unit) {
                                      if (cf->cf_fstate == FSTATE_NOTFOUND)
                                              cf->cf_fstate = FSTATE_FOUND;
                              }
                      }
              }
              device_register(dev, aux);
      
              /* Let userland know */
              devmon_report_device(dev, true);
      
              (*dev->dv_cfattach->ca_attach)(parent, dev, aux);
      
              if (((dev->dv_flags & DVF_ATTACH_INPROGRESS) == 0)
                  && !device_pmf_is_registered(dev))
                      aprint_debug_dev(dev,
                          "WARNING: power management not supported\n");
      
              config_process_deferred(&deferred_config_queue, dev);
      
              device_register_post_config(dev, aux);
              return dev;
      }
      
      device_t
      config_attach(device_t parent, cfdata_t cf, void *aux, cfprint_t print)
      {
      
              return config_attach_loc(parent, cf, NULL, aux, print);
      }
      
      /*
       * As above, but for pseudo-devices.  Pseudo-devices attached in this
       * way are silently inserted into the device tree, and their children
       * attached.
       *
       * Note that because pseudo-devices are attached silently, any information
       * the attach routine wishes to print should be prefixed with the device
       * name by the attach routine.
       */
      device_t
      config_attach_pseudo(cfdata_t cf)
      {
              device_t dev;
      
              dev = config_devalloc(ROOT, cf, NULL);
              if (!dev)
                      return NULL;
      
              /* XXX mark busy in cfdata */
      
              if (cf->cf_fstate != FSTATE_STAR) {
                      KASSERT(cf->cf_fstate == FSTATE_NOTFOUND);
                      cf->cf_fstate = FSTATE_FOUND;
              }
      
              config_devlink(dev);
      
      #if 0        /* XXXJRT not yet */
              device_register(dev, NULL);        /* like a root node */
      #endif
      
              /* Let userland know */
              devmon_report_device(dev, true);
      
              (*dev->dv_cfattach->ca_attach)(ROOT, dev, NULL);
      
              config_process_deferred(&deferred_config_queue, dev);
              return dev;
      }
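
/*
 * Example (sketch): pseudo-device drivers typically build a one-off cfdata
 * entry by hand and pass it to config_attach_pseudo().  The "mypseudo"
 * driver name is hypothetical, and its cfdriver/cfattach pair must already
 * be registered (see config_cfdriver_attach()/config_cfattach_attach()).
 */
#if 0
static device_t
mypseudo_create(void)
{
	cfdata_t cf;

	cf = kmem_zalloc(sizeof(*cf), KM_SLEEP);
	cf->cf_name = "mypseudo";
	cf->cf_atname = "mypseudo";
	cf->cf_unit = 0;
	cf->cf_fstate = FSTATE_STAR;

	return config_attach_pseudo(cf);
}
#endif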
      
      /*
       * Caller must hold alldevs_lock.
       */
      static void
      config_collect_garbage(struct devicelist *garbage)
      {
              device_t dv;
      
              KASSERT(!cpu_intr_p());
              KASSERT(!cpu_softintr_p());
              KASSERT(mutex_owned(&alldevs_lock));
      
              while (alldevs_nwrite == 0 && alldevs_nread == 0 && alldevs_garbage) {
                      TAILQ_FOREACH(dv, &alldevs, dv_list) {
                              if (dv->dv_del_gen != 0)
                                      break;
                      }
                      if (dv == NULL) {
                              alldevs_garbage = false;
                              break;
                      }
                      config_devunlink(dv, garbage);
              }
              KASSERT(mutex_owned(&alldevs_lock));
      }
      
      static void
      config_dump_garbage(struct devicelist *garbage)
      {
              device_t dv;
      
              while ((dv = TAILQ_FIRST(garbage)) != NULL) {
                      TAILQ_REMOVE(garbage, dv, dv_list);
                      config_devdelete(dv);
              }
      }
      
      /*
       * Detach a device.  Optionally forced (e.g. because of hardware
       * removal) and quiet.  Returns zero if successful, non-zero
       * (an error code) otherwise.
       *
       * Note that this code wants to be run from a process context, so
       * that the detach can sleep to allow processes which have a device
       * open to run and unwind their stacks.
       */
      int
      config_detach(device_t dev, int flags)
      {
              struct alldevs_foray af;
              struct cftable *ct;
              cfdata_t cf;
              const struct cfattach *ca;
              struct cfdriver *cd;
              device_t d __diagused;
              int rv = 0;
      
              cf = dev->dv_cfdata;
              KASSERTMSG((cf == NULL || cf->cf_fstate == FSTATE_FOUND ||
                      cf->cf_fstate == FSTATE_STAR),
                  "config_detach: %s: bad device fstate: %d",
                  device_xname(dev), cf ? cf->cf_fstate : -1);
      
              cd = dev->dv_cfdriver;
              KASSERT(cd != NULL);
      
              ca = dev->dv_cfattach;
              KASSERT(ca != NULL);
      
              mutex_enter(&alldevs_lock);
              if (dev->dv_del_gen != 0) {
                      mutex_exit(&alldevs_lock);
      #ifdef DIAGNOSTIC
                      printf("%s: %s is already detached\n", __func__,
                          device_xname(dev));
      #endif /* DIAGNOSTIC */
                      return ENOENT;
              }
              alldevs_nwrite++;
              mutex_exit(&alldevs_lock);
      
              if (!detachall &&
                  (flags & (DETACH_SHUTDOWN|DETACH_FORCE)) == DETACH_SHUTDOWN &&
                  (dev->dv_flags & DVF_DETACH_SHUTDOWN) == 0) {
                      rv = EOPNOTSUPP;
              } else if (ca->ca_detach != NULL) {
                      rv = (*ca->ca_detach)(dev, flags);
              } else
                      rv = EOPNOTSUPP;
      
              /*
               * If it was not possible to detach the device, then we either
               * panic() (for the forced but failed case), or return an error.
               *
               * If it was possible to detach the device, ensure that the
               * device is deactivated.
               */
              if (rv == 0)
                      dev->dv_flags &= ~DVF_ACTIVE;
              else if ((flags & DETACH_FORCE) == 0)
                      goto out;
              else {
                      panic("config_detach: forced detach of %s failed (%d)",
                          device_xname(dev), rv);
              }
      
              /*
               * The device has now been successfully detached.
               */
      
              /* Let userland know */
              devmon_report_device(dev, false);
      
      #ifdef DIAGNOSTIC
              /*
               * Sanity: If you're successfully detached, you should have no
               * children.  (Note that because children must be attached
               * after parents, we only need to search the latter part of
               * the list.)
               */
              for (d = TAILQ_NEXT(dev, dv_list); d != NULL;
                  d = TAILQ_NEXT(d, dv_list)) {
                      if (d->dv_parent == dev && d->dv_del_gen == 0) {
                              printf("config_detach: detached device %s"
                                  " has children %s\n", device_xname(dev),
                                  device_xname(d));
                              panic("config_detach");
                      }
              }
      #endif
      
              /* notify the parent that the child is gone */
              if (dev->dv_parent) {
                      device_t p = dev->dv_parent;
                      if (p->dv_cfattach->ca_childdetached)
                              (*p->dv_cfattach->ca_childdetached)(p, dev);
              }
      
              /*
               * Mark cfdata to show that the unit can be reused, if possible.
               */
              TAILQ_FOREACH(ct, &allcftables, ct_list) {
                      for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
                              if (STREQ(cf->cf_name, cd->cd_name)) {
                                      if (cf->cf_fstate == FSTATE_FOUND &&
                                          cf->cf_unit == dev->dv_unit)
                                              cf->cf_fstate = FSTATE_NOTFOUND;
                              }
                      }
              }
      
              if (dev->dv_cfdata != NULL && (flags & DETACH_QUIET) == 0)
                      aprint_normal_dev(dev, "detached\n");
      
      out:
              config_alldevs_enter(&af);
              KASSERT(alldevs_nwrite != 0);
              --alldevs_nwrite;
              if (rv == 0 && dev->dv_del_gen == 0) {
                      if (alldevs_nwrite == 0 && alldevs_nread == 0)
                              config_devunlink(dev, &af.af_garbage);
                      else {
                              dev->dv_del_gen = alldevs_gen;
                              alldevs_garbage = true;
                      }
              }
              config_alldevs_exit(&af);
      
              return rv;
      }
      
      int
      config_detach_children(device_t parent, int flags)
      {
              device_t dv;
              deviter_t di;
              int error = 0;
      
              for (dv = deviter_first(&di, DEVITER_F_RW); dv != NULL;
                   dv = deviter_next(&di)) {
                      if (device_parent(dv) != parent)
                              continue;
                      if ((error = config_detach(dv, flags)) != 0)
                              break;
              }
              deviter_release(&di);
              return error;
      }
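
      /*
       * Illustrative sketch (not compiled; not part of the original source):
       * a hypothetical bus driver's detach routine typically detaches its
       * children with config_detach_children() before tearing down its own
       * state.  The examplebus_* names are made up for this sketch.
       */
      #if 0
      static int
      examplebus_detach(device_t self, int flags)
      {
              struct examplebus_softc *sc = device_private(self);
              int error;

              /* Refuse to go away while any child is still attached. */
              if ((error = config_detach_children(self, flags)) != 0)
                      return error;

              /* Children are gone; release bus resources held by sc here. */
              (void)sc;
              return 0;
      }
      #endif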
      
      device_t
      shutdown_first(struct shutdown_state *s)
      {
              if (!s->initialized) {
                      deviter_init(&s->di, DEVITER_F_SHUTDOWN|DEVITER_F_LEAVES_FIRST);
                      s->initialized = true;
              }
              return shutdown_next(s);
      }
      
      device_t
      shutdown_next(struct shutdown_state *s)
      {
              device_t dv;
      
              while ((dv = deviter_next(&s->di)) != NULL && !device_is_active(dv))
                      ;
      
              if (dv == NULL)
                      s->initialized = false;
      
              return dv;
      }
      
      bool
      config_detach_all(int how)
      {
              static struct shutdown_state s;
              device_t curdev;
              bool progress = false;
              int flags;
      
              if ((how & (RB_NOSYNC|RB_DUMP)) != 0)
                      return false;
      
              if ((how & RB_POWERDOWN) == RB_POWERDOWN)
                      flags = DETACH_SHUTDOWN | DETACH_POWEROFF;
              else
                      flags = DETACH_SHUTDOWN;
      
              for (curdev = shutdown_first(&s); curdev != NULL;
                   curdev = shutdown_next(&s)) {
                      aprint_debug(" detaching %s, ", device_xname(curdev));
                      if (config_detach(curdev, flags) == 0) {
                              progress = true;
                              aprint_debug("success.");
                      } else
                              aprint_debug("failed.");
              }
              return progress;
      }
      
      static bool
      device_is_ancestor_of(device_t ancestor, device_t descendant)
      {
              device_t dv;
      
              for (dv = descendant; dv != NULL; dv = device_parent(dv)) {
                      if (device_parent(dv) == ancestor)
                              return true;
              }
              return false;
      }
      
      int
      config_deactivate(device_t dev)
      {
              deviter_t di;
              const struct cfattach *ca;
              device_t descendant;
              int s, rv = 0, oflags;
      
              for (descendant = deviter_first(&di, DEVITER_F_ROOT_FIRST);
                   descendant != NULL;
                   descendant = deviter_next(&di)) {
                      if (dev != descendant &&
                          !device_is_ancestor_of(dev, descendant))
                              continue;
      
                      if ((descendant->dv_flags & DVF_ACTIVE) == 0)
                              continue;
      
                      ca = descendant->dv_cfattach;
                      oflags = descendant->dv_flags;
      
                      descendant->dv_flags &= ~DVF_ACTIVE;
                      if (ca->ca_activate == NULL)
                              continue;
                      s = splhigh();
                      rv = (*ca->ca_activate)(descendant, DVACT_DEACTIVATE);
                      splx(s);
                      if (rv != 0)
                              descendant->dv_flags = oflags;
              }
              deviter_release(&di);
              return rv;
      }
      
      /*
       * Defer the configuration of the specified device until all
       * of its parent's devices have been attached.
       */
      void
      config_defer(device_t dev, void (*func)(device_t))
      {
              struct deferred_config *dc;
      
              if (dev->dv_parent == NULL)
                      panic("config_defer: can't defer config of a root device");
      
      #ifdef DIAGNOSTIC
              TAILQ_FOREACH(dc, &deferred_config_queue, dc_queue) {
                      if (dc->dc_dev == dev)
                              panic("config_defer: deferred twice");
              }
      #endif
      
              dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
              dc->dc_dev = dev;
              dc->dc_func = func;
              TAILQ_INSERT_TAIL(&deferred_config_queue, dc, dc_queue);
              config_pending_incr(dev);
      }
      
      /*
       * Defer some autoconfiguration for a device until after interrupts
       * are enabled.
       */
      void
      config_interrupts(device_t dev, void (*func)(device_t))
      {
              struct deferred_config *dc;
      
              /*
               * If interrupts are enabled, callback now.
               */
              if (cold == 0) {
                      (*func)(dev);
                      return;
              }
      
      #ifdef DIAGNOSTIC
              TAILQ_FOREACH(dc, &interrupt_config_queue, dc_queue) {
                      if (dc->dc_dev == dev)
                              panic("config_interrupts: deferred twice");
              }
      #endif
      
              dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
              dc->dc_dev = dev;
              dc->dc_func = func;
              TAILQ_INSERT_TAIL(&interrupt_config_queue, dc, dc_queue);
              config_pending_incr(dev);
              dev->dv_flags |= DVF_ATTACH_INPROGRESS;
      }
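
      /*
       * Illustrative sketch (not compiled; not part of the original source):
       * the usual pattern is to finish the interrupt-dependent part of
       * attach in a callback registered with config_interrupts();
       * config_defer() and config_mountroot() follow the same shape.  The
       * example_* names are hypothetical.
       */
      #if 0
      static void example_attach_intr(device_t);

      static void
      example_attach(device_t parent, device_t self, void *aux)
      {
              /* ... early initialization that needs no interrupts ... */

              /* Run example_attach_intr() once interrupts are enabled. */
              config_interrupts(self, example_attach_intr);
      }

      static void
      example_attach_intr(device_t self)
      {
              /* ... firmware load, interrupt-driven probing, etc. ... */
      }
      #endif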
      
      /*
       * Defer some autoconfiguration for a device until after root file system
       * is mounted (to load firmware etc).
       */
      void
      config_mountroot(device_t dev, void (*func)(device_t))
      {
              struct deferred_config *dc;
      
              /*
               * If root file system is mounted, callback now.
               */
              if (root_is_mounted) {
                      (*func)(dev);
                      return;
              }
      
      #ifdef DIAGNOSTIC
              TAILQ_FOREACH(dc, &mountroot_config_queue, dc_queue) {
                      if (dc->dc_dev == dev)
                              panic("%s: deferred twice", __func__);
              }
      #endif
      
              dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
              dc->dc_dev = dev;
              dc->dc_func = func;
              TAILQ_INSERT_TAIL(&mountroot_config_queue, dc, dc_queue);
      }
      
      /*
       * Process a deferred configuration queue.
       */
      static void
      config_process_deferred(struct deferred_config_head *queue, device_t parent)
      {
              struct deferred_config *dc, *ndc;
      
              for (dc = TAILQ_FIRST(queue); dc != NULL; dc = ndc) {
                      ndc = TAILQ_NEXT(dc, dc_queue);
                      if (parent == NULL || dc->dc_dev->dv_parent == parent) {
                              TAILQ_REMOVE(queue, dc, dc_queue);
                              (*dc->dc_func)(dc->dc_dev);
                              config_pending_decr(dc->dc_dev);
                              kmem_free(dc, sizeof(*dc));
                      }
              }
      }
      
      /*
       * Manipulate the config_pending semaphore.
       */
      void
      config_pending_incr(device_t dev)
      {
      
              mutex_enter(&config_misc_lock);
              config_pending++;
      #ifdef DEBUG_AUTOCONF
              printf("%s: %s %d\n", __func__, device_xname(dev), config_pending);
      #endif
              mutex_exit(&config_misc_lock);
      }
      
      void
      config_pending_decr(device_t dev)
      {
      
              KASSERT(0 < config_pending);
              mutex_enter(&config_misc_lock);
              config_pending--;
      #ifdef DEBUG_AUTOCONF
              printf("%s: %s %d\n", __func__, device_xname(dev), config_pending);
      #endif
              if (config_pending == 0)
                      cv_broadcast(&config_misc_cv);
              mutex_exit(&config_misc_lock);
      }
      
      /*
       * Register a "finalization" routine.  Finalization routines are
       * called iteratively once all real devices have been found during
       * autoconfiguration, for as long as any one finalizer has done
       * any work.
       */
      int
      config_finalize_register(device_t dev, int (*fn)(device_t))
      {
              struct finalize_hook *f;
      
              /*
               * If finalization has already been done, invoke the
               * callback function now.
               */
              if (config_finalize_done) {
                      while ((*fn)(dev) != 0)
                              /* loop */ ;
                      return 0;
              }
      
              /* Ensure this isn't already on the list. */
              TAILQ_FOREACH(f, &config_finalize_list, f_list) {
                      if (f->f_func == fn && f->f_dev == dev)
                              return EEXIST;
              }
      
              f = kmem_alloc(sizeof(*f), KM_SLEEP);
              f->f_func = fn;
              f->f_dev = dev;
              TAILQ_INSERT_TAIL(&config_finalize_list, f, f_list);
      
              return 0;
      }
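
      /*
       * Illustrative sketch (not compiled; not part of the original source):
       * a finalization hook returns non-zero for as long as it makes
       * progress and zero once there is nothing left to do, so the loop in
       * config_finalize() below terminates.  example_finalize,
       * example_work_pending and example_do_work are hypothetical.
       */
      #if 0
      static int
      example_finalize(device_t self)
      {
              if (example_work_pending(self)) {
                      example_do_work(self);
                      return 1;       /* did work; run all hooks again */
              }
              return 0;               /* idle; safe to stop calling us */
      }
      #endif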
      
      void
      config_finalize(void)
      {
              struct finalize_hook *f;
              struct pdevinit *pdev;
              extern struct pdevinit pdevinit[];
              int errcnt, rv;
      
              /*
               * Now that device driver threads have been created, wait for
               * them to finish any deferred autoconfiguration.
               */
              mutex_enter(&config_misc_lock);
              while (config_pending != 0)
                      cv_wait(&config_misc_cv, &config_misc_lock);
              mutex_exit(&config_misc_lock);
      
              KERNEL_LOCK(1, NULL);
      
              /* Attach pseudo-devices. */
              for (pdev = pdevinit; pdev->pdev_attach != NULL; pdev++)
                      (*pdev->pdev_attach)(pdev->pdev_count);
      
              /* Run the hooks until none of them does any work. */
              do {
                      rv = 0;
                      TAILQ_FOREACH(f, &config_finalize_list, f_list)
                              rv |= (*f->f_func)(f->f_dev);
              } while (rv != 0);
      
              config_finalize_done = 1;
      
              /* Now free all the hooks. */
              while ((f = TAILQ_FIRST(&config_finalize_list)) != NULL) {
                      TAILQ_REMOVE(&config_finalize_list, f, f_list);
                      kmem_free(f, sizeof(*f));
              }
      
              KERNEL_UNLOCK_ONE(NULL);
      
              errcnt = aprint_get_error_count();
              if ((boothowto & (AB_QUIET|AB_SILENT)) != 0 &&
                  (boothowto & AB_VERBOSE) == 0) {
                      mutex_enter(&config_misc_lock);
                      if (config_do_twiddle) {
                              config_do_twiddle = 0;
                              printf_nolog(" done.\n");
                      }
                      mutex_exit(&config_misc_lock);
              }
              if (errcnt != 0) {
                      printf("WARNING: %d error%s while detecting hardware; "
                          "check system log.\n", errcnt,
                          errcnt == 1 ? "" : "s");
              }
      }
      
      void
      config_twiddle_init(void)
      {
      
              if ((boothowto & (AB_SILENT|AB_VERBOSE)) == AB_SILENT) {
                      config_do_twiddle = 1;
              }
              callout_setfunc(&config_twiddle_ch, config_twiddle_fn, NULL);
      }
      
      void
      config_twiddle_fn(void *cookie)
      {
      
              mutex_enter(&config_misc_lock);
              if (config_do_twiddle) {
                      twiddle();
                      callout_schedule(&config_twiddle_ch, mstohz(100));
              }
              mutex_exit(&config_misc_lock);
      }
      
      static void
      config_alldevs_enter(struct alldevs_foray *af)
      {
              TAILQ_INIT(&af->af_garbage);
              mutex_enter(&alldevs_lock);
              config_collect_garbage(&af->af_garbage);
      }
      
      static void
      config_alldevs_exit(struct alldevs_foray *af)
      {
              mutex_exit(&alldevs_lock);
              config_dump_garbage(&af->af_garbage);
      }
      
      /*
       * device_lookup:
       *
       *        Look up a device instance for a given driver.
       */
      device_t
      device_lookup(cfdriver_t cd, int unit)
      {
              device_t dv;
      
              mutex_enter(&alldevs_lock);
              if (unit < 0 || unit >= cd->cd_ndevs)
                      dv = NULL;
              else if ((dv = cd->cd_devs[unit]) != NULL && dv->dv_del_gen != 0)
                      dv = NULL;
              mutex_exit(&alldevs_lock);
      
              return dv;
      }
      
      /*
       * device_lookup_private:
       *
       *        Look up a softc instance for a given driver.
       */
      void *
      device_lookup_private(cfdriver_t cd, int unit)
      {
      
              return device_private(device_lookup(cd, unit));
      }
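
      /*
       * Illustrative sketch (not compiled; not part of the original source):
       * character-device entry points commonly map a minor number back to
       * the driver's softc via device_lookup_private().  example_cd and
       * struct example_softc are hypothetical names.
       */
      #if 0
      static int
      example_open(dev_t dev, int flag, int mode, struct lwp *l)
      {
              struct example_softc *sc;

              sc = device_lookup_private(&example_cd, minor(dev));
              if (sc == NULL)
                      return ENXIO;
              /* ... per-open setup using sc ... */
              return 0;
      }
      #endif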
      
      /*
       * device_find_by_xname:
       *
       *        Returns the device of the given name or NULL if it doesn't exist.
       */
      device_t
      device_find_by_xname(const char *name)
      {
              device_t dv;
              deviter_t di;
      
              for (dv = deviter_first(&di, 0); dv != NULL; dv = deviter_next(&di)) {
                      if (strcmp(device_xname(dv), name) == 0)
                              break;
              }
              deviter_release(&di);
      
              return dv;
      }
      
      /*
       * device_find_by_driver_unit:
       *
       *        Returns the device of the given driver name and unit or
       *        NULL if it doesn't exist.
       */
      device_t
      device_find_by_driver_unit(const char *name, int unit)
      {
              struct cfdriver *cd;
      
              if ((cd = config_cfdriver_lookup(name)) == NULL)
                      return NULL;
              return device_lookup(cd, unit);
      }
      
      /*
       * device_compatible_match:
       *
       *        Match a driver's "compatible" data against a device's
       *        "compatible" strings.  If a match is found, we return
       *        a weighted match result, and optionally the matching
       *        entry.
       */
      int
      device_compatible_match(const char **device_compats, int ndevice_compats,
                              const struct device_compatible_entry *driver_compats,
                              const struct device_compatible_entry **matching_entryp)
      {
              const struct device_compatible_entry *dce = NULL;
              int i, match_weight;
      
              if (ndevice_compats == 0 || device_compats == NULL ||
                  driver_compats == NULL)
                      return 0;
              
              /*
               * We take the first match because we start with the most-specific
               * device compatible string.
               */
              for (i = 0, match_weight = ndevice_compats - 1;
                   i < ndevice_compats;
                   i++, match_weight--) {
                      for (dce = driver_compats; dce->compat != NULL; dce++) {
                              if (strcmp(dce->compat, device_compats[i]) == 0) {
                                      KASSERT(match_weight >= 0);
                                      if (matching_entryp)
                                              *matching_entryp = dce;
                                      return 1 + match_weight;
                              }
                      }
              }
              return 0;
      }
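
      /*
       * Illustrative sketch (not compiled; not part of the original source):
       * a bus match routine can feed the device's "compatible" strings and
       * a driver-supplied table to device_compatible_match() and return the
       * weighted result directly.  The compat table contents and the
       * exampleattach_args structure are hypothetical.
       */
      #if 0
      static const struct device_compatible_entry compat_data[] = {
              { .compat = "vendor,example-device-v2" },
              { .compat = "vendor,example-device" },
              { .compat = NULL }
      };

      static int
      example_match(device_t parent, cfdata_t cf, void *aux)
      {
              struct exampleattach_args *ea = aux;

              return device_compatible_match(ea->ea_compats, ea->ea_ncompats,
                  compat_data, NULL);
      }
      #endif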
      
      /*
       * Power management related functions.
       */
      
      bool
      device_pmf_is_registered(device_t dev)
      {
              return (dev->dv_flags & DVF_POWER_HANDLERS) != 0;
      }
      
      bool
      device_pmf_driver_suspend(device_t dev, const pmf_qual_t *qual)
      {
              if ((dev->dv_flags & DVF_DRIVER_SUSPENDED) != 0)
                      return true;
              if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0)
                      return false;
              if (pmf_qual_depth(qual) <= DEVACT_LEVEL_DRIVER &&
                  dev->dv_driver_suspend != NULL &&
                  !(*dev->dv_driver_suspend)(dev, qual))
                      return false;
      
              dev->dv_flags |= DVF_DRIVER_SUSPENDED;
              return true;
      }
      
      bool
      device_pmf_driver_resume(device_t dev, const pmf_qual_t *qual)
      {
              if ((dev->dv_flags & DVF_DRIVER_SUSPENDED) == 0)
                      return true;
              if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0)
                      return false;
              if (pmf_qual_depth(qual) <= DEVACT_LEVEL_DRIVER &&
                  dev->dv_driver_resume != NULL &&
                  !(*dev->dv_driver_resume)(dev, qual))
                      return false;
      
              dev->dv_flags &= ~DVF_DRIVER_SUSPENDED;
              return true;
      }
      
      bool
      device_pmf_driver_shutdown(device_t dev, int how)
      {
      
              if (dev->dv_driver_shutdown != NULL &&
                  !(*dev->dv_driver_shutdown)(dev, how))
                      return false;
              return true;
      }
      
      bool
      device_pmf_driver_register(device_t dev,
          bool (*suspend)(device_t, const pmf_qual_t *),
          bool (*resume)(device_t, const pmf_qual_t *),
          bool (*shutdown)(device_t, int))
      {
              dev->dv_driver_suspend = suspend;
              dev->dv_driver_resume = resume;
              dev->dv_driver_shutdown = shutdown;
              dev->dv_flags |= DVF_POWER_HANDLERS;
              return true;
      }
      
      static const char *
      curlwp_name(void)
      {
              if (curlwp->l_name != NULL)
                      return curlwp->l_name;
              else
                      return curlwp->l_proc->p_comm;
      }
      
      void
      device_pmf_driver_deregister(device_t dev)
      {
              device_lock_t dvl = device_getlock(dev);
      
              dev->dv_driver_suspend = NULL;
              dev->dv_driver_resume = NULL;
      
              mutex_enter(&dvl->dvl_mtx);
              dev->dv_flags &= ~DVF_POWER_HANDLERS;
              while (dvl->dvl_nlock > 0 || dvl->dvl_nwait > 0) {
                      /* Wake a thread that waits for the lock.  That
                       * thread will fail to acquire the lock, and then
                       * it will wake the next thread that waits for the
                       * lock, or else it will wake us.
                       */
                      cv_signal(&dvl->dvl_cv);
                      pmflock_debug(dev, __func__, __LINE__);
                      cv_wait(&dvl->dvl_cv, &dvl->dvl_mtx);
                      pmflock_debug(dev, __func__, __LINE__);
              }
              mutex_exit(&dvl->dvl_mtx);
      }
      
      bool
      device_pmf_driver_child_register(device_t dev)
      {
              device_t parent = device_parent(dev);
      
              if (parent == NULL || parent->dv_driver_child_register == NULL)
                      return true;
              return (*parent->dv_driver_child_register)(dev);
      }
      
      void
      device_pmf_driver_set_child_register(device_t dev,
          bool (*child_register)(device_t))
      {
              dev->dv_driver_child_register = child_register;
      }
      
      static void
      pmflock_debug(device_t dev, const char *func, int line)
      {
              device_lock_t dvl = device_getlock(dev);
      
              aprint_debug_dev(dev,
                  "%s.%d, %s dvl_nlock %d dvl_nwait %d dv_flags %x\n", func, line,
                  curlwp_name(), dvl->dvl_nlock, dvl->dvl_nwait, dev->dv_flags);
      }
      
      static bool
      device_pmf_lock1(device_t dev)
      {
              device_lock_t dvl = device_getlock(dev);
      
              while (device_pmf_is_registered(dev) &&
                  dvl->dvl_nlock > 0 && dvl->dvl_holder != curlwp) {
                      dvl->dvl_nwait++;
                      pmflock_debug(dev, __func__, __LINE__);
                      cv_wait(&dvl->dvl_cv, &dvl->dvl_mtx);
                      pmflock_debug(dev, __func__, __LINE__);
                      dvl->dvl_nwait--;
              }
              if (!device_pmf_is_registered(dev)) {
                      pmflock_debug(dev, __func__, __LINE__);
                      /* We could not acquire the lock, but some other thread may
                       * wait for it, also.  Wake that thread.
                       */
                      cv_signal(&dvl->dvl_cv);
                      return false;
              }
              dvl->dvl_nlock++;
              dvl->dvl_holder = curlwp;
              pmflock_debug(dev, __func__, __LINE__);
              return true;
      }
      
      bool
      device_pmf_lock(device_t dev)
      {
              bool rc;
              device_lock_t dvl = device_getlock(dev);
      
              mutex_enter(&dvl->dvl_mtx);
              rc = device_pmf_lock1(dev);
              mutex_exit(&dvl->dvl_mtx);
      
              return rc;
      }
      
      void
      device_pmf_unlock(device_t dev)
      {
              device_lock_t dvl = device_getlock(dev);
      
              KASSERT(dvl->dvl_nlock > 0);
              mutex_enter(&dvl->dvl_mtx);
              if (--dvl->dvl_nlock == 0)
                      dvl->dvl_holder = NULL;
              cv_signal(&dvl->dvl_cv);
              pmflock_debug(dev, __func__, __LINE__);
              mutex_exit(&dvl->dvl_mtx);
      }
      
      device_lock_t
      device_getlock(device_t dev)
      {
              return &dev->dv_lock;
      }
      
      void *
      device_pmf_bus_private(device_t dev)
      {
              return dev->dv_bus_private;
      }
      
      bool
      device_pmf_bus_suspend(device_t dev, const pmf_qual_t *qual)
      {
              if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0)
                      return true;
              if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0 ||
                  (dev->dv_flags & DVF_DRIVER_SUSPENDED) == 0)
                      return false;
              if (pmf_qual_depth(qual) <= DEVACT_LEVEL_BUS &&
                  dev->dv_bus_suspend != NULL &&
                  !(*dev->dv_bus_suspend)(dev, qual))
                      return false;
      
              dev->dv_flags |= DVF_BUS_SUSPENDED;
              return true;
      }
      
      bool
      device_pmf_bus_resume(device_t dev, const pmf_qual_t *qual)
      {
              if ((dev->dv_flags & DVF_BUS_SUSPENDED) == 0)
                      return true;
              if (pmf_qual_depth(qual) <= DEVACT_LEVEL_BUS &&
                  dev->dv_bus_resume != NULL &&
                  !(*dev->dv_bus_resume)(dev, qual))
                      return false;
      
              dev->dv_flags &= ~DVF_BUS_SUSPENDED;
              return true;
      }
      
      bool
      device_pmf_bus_shutdown(device_t dev, int how)
      {
      
              if (dev->dv_bus_shutdown != NULL &&
                  !(*dev->dv_bus_shutdown)(dev, how))
                      return false;
              return true;
      }
      
      void
      device_pmf_bus_register(device_t dev, void *priv,
          bool (*suspend)(device_t, const pmf_qual_t *),
          bool (*resume)(device_t, const pmf_qual_t *),
          bool (*shutdown)(device_t, int), void (*deregister)(device_t))
      {
              dev->dv_bus_private = priv;
              dev->dv_bus_resume = resume;
              dev->dv_bus_suspend = suspend;
              dev->dv_bus_shutdown = shutdown;
              dev->dv_bus_deregister = deregister;
      }
      
      void
      device_pmf_bus_deregister(device_t dev)
      {
              if (dev->dv_bus_deregister == NULL)
                      return;
              (*dev->dv_bus_deregister)(dev);
              dev->dv_bus_private = NULL;
              dev->dv_bus_suspend = NULL;
              dev->dv_bus_resume = NULL;
              dev->dv_bus_deregister = NULL;
      }
      
      void *
      device_pmf_class_private(device_t dev)
      {
              return dev->dv_class_private;
      }
      
      bool
      device_pmf_class_suspend(device_t dev, const pmf_qual_t *qual)
      {
              if ((dev->dv_flags & DVF_CLASS_SUSPENDED) != 0)
                      return true;
              if (pmf_qual_depth(qual) <= DEVACT_LEVEL_CLASS &&
                  dev->dv_class_suspend != NULL &&
                  !(*dev->dv_class_suspend)(dev, qual))
                      return false;
      
              dev->dv_flags |= DVF_CLASS_SUSPENDED;
              return true;
      }
      
      bool
      device_pmf_class_resume(device_t dev, const pmf_qual_t *qual)
      {
              if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0)
                      return true;
              if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0 ||
                  (dev->dv_flags & DVF_DRIVER_SUSPENDED) != 0)
                      return false;
              if (pmf_qual_depth(qual) <= DEVACT_LEVEL_CLASS &&
                  dev->dv_class_resume != NULL &&
                  !(*dev->dv_class_resume)(dev, qual))
                      return false;
      
              dev->dv_flags &= ~DVF_CLASS_SUSPENDED;
              return true;
      }
      
      void
      device_pmf_class_register(device_t dev, void *priv,
          bool (*suspend)(device_t, const pmf_qual_t *),
          bool (*resume)(device_t, const pmf_qual_t *),
          void (*deregister)(device_t))
      {
              dev->dv_class_private = priv;
              dev->dv_class_suspend = suspend;
              dev->dv_class_resume = resume;
              dev->dv_class_deregister = deregister;
      }
      
      void
      device_pmf_class_deregister(device_t dev)
      {
              if (dev->dv_class_deregister == NULL)
                      return;
              (*dev->dv_class_deregister)(dev);
              dev->dv_class_private = NULL;
              dev->dv_class_suspend = NULL;
              dev->dv_class_resume = NULL;
              dev->dv_class_deregister = NULL;
      }
      
      bool
      device_active(device_t dev, devactive_t type)
      {
              size_t i;
      
              if (dev->dv_activity_count == 0)
                      return false;
      
              for (i = 0; i < dev->dv_activity_count; ++i) {
                      if (dev->dv_activity_handlers[i] == NULL)
                              break;
                      (*dev->dv_activity_handlers[i])(dev, type);
              }
      
              return true;
      }
      
      bool
      device_active_register(device_t dev, void (*handler)(device_t, devactive_t))
      {
              void (**new_handlers)(device_t, devactive_t);
              void (**old_handlers)(device_t, devactive_t);
              size_t i, old_size, new_size;
              int s;
      
              old_handlers = dev->dv_activity_handlers;
              old_size = dev->dv_activity_count;
      
              KASSERT(old_size == 0 || old_handlers != NULL);
      
              for (i = 0; i < old_size; ++i) {
                      KASSERT(old_handlers[i] != handler);
                      if (old_handlers[i] == NULL) {
                              old_handlers[i] = handler;
                              return true;
                      }
              }
      
              new_size = old_size + 4;
              new_handlers = kmem_alloc(sizeof(void *[new_size]), KM_SLEEP);
      
              for (i = 0; i < old_size; ++i)
                      new_handlers[i] = old_handlers[i];
              new_handlers[old_size] = handler;
              for (i = old_size+1; i < new_size; ++i)
                      new_handlers[i] = NULL;
      
              s = splhigh();
              dev->dv_activity_count = new_size;
              dev->dv_activity_handlers = new_handlers;
              splx(s);
      
              if (old_size > 0)
                      kmem_free(old_handlers, sizeof(void * [old_size]));
      
              return true;
      }
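
      /*
       * Illustrative sketch (not compiled; not part of the original source):
       * a driver that wants to observe activity events (for example to
       * restart an idle timer) registers a handler, which device_active()
       * above then calls for every event.  The example_* names are
       * hypothetical.
       */
      #if 0
      static void
      example_activity(device_t dev, devactive_t type)
      {
              /* Called from device_active(); keep it short and non-sleeping. */
      }

      static void
      example_attach_activity(device_t self)
      {
              /* Register at attach time; deregister again from detach. */
              device_active_register(self, example_activity);
      }
      #endif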
      
      void
      device_active_deregister(device_t dev, void (*handler)(device_t, devactive_t))
      {
              void (**old_handlers)(device_t, devactive_t);
              size_t i, old_size;
              int s;
      
              old_handlers = dev->dv_activity_handlers;
              old_size = dev->dv_activity_count;
      
              for (i = 0; i < old_size; ++i) {
                      if (old_handlers[i] == handler)
                              break;
                      if (old_handlers[i] == NULL)
                              return; /* XXX panic? */
              }
      
              if (i == old_size)
                      return; /* XXX panic? */
      
              for (; i < old_size - 1; ++i) {
                      if ((old_handlers[i] = old_handlers[i + 1]) != NULL)
                              continue;
      
                      if (i == 0) {
                              s = splhigh();
                              dev->dv_activity_count = 0;
                              dev->dv_activity_handlers = NULL;
                              splx(s);
                              kmem_free(old_handlers, sizeof(void *[old_size]));
                      }
                      return;
              }
              old_handlers[i] = NULL;
      }
      
      /* Return true iff the device_t `dv' exists at generation `gen'. */
      static bool
      device_exists_at(device_t dv, devgen_t gen)
      {
              return (dv->dv_del_gen == 0 || dv->dv_del_gen > gen) &&
                  dv->dv_add_gen <= gen;
      }
      
      static bool
      deviter_visits(const deviter_t *di, device_t dv)
      {
              return device_exists_at(dv, di->di_gen);
      }
      
      /*
       * Device Iteration
       *
       * deviter_t: a device iterator.  Holds state for a "walk" visiting
       *     each device_t in the device tree.
       *
       * deviter_init(di, flags): initialize the device iterator `di'
       *     to "walk" the device tree.  deviter_next(di) will return
       *     the first device_t in the device tree, or NULL if there are
       *     no devices.
       *
       *     `flags' is one or more of DEVITER_F_RW, indicating that the
       *     caller intends to modify the device tree by calling
       *     config_detach(9) on devices in the order that the iterator
       *     returns them; DEVITER_F_ROOT_FIRST, asking for the devices
       *     nearest the "root" of the device tree to be returned, first;
       *     DEVITER_F_LEAVES_FIRST, asking for the devices furthest from
       *     the root of the device tree, first; and DEVITER_F_SHUTDOWN,
       *     indicating both that deviter_init() should not respect any
       *     locks on the device tree, and that deviter_next(di) may run
       *     in more than one LWP before the walk has finished.
       *
       *     Only one DEVITER_F_RW iterator may be in the device tree at
       *     once.
       *
       *     DEVITER_F_SHUTDOWN implies DEVITER_F_RW.
       *
       *     Results are undefined if the flags DEVITER_F_ROOT_FIRST and
       *     DEVITER_F_LEAVES_FIRST are used in combination.
       *
       * deviter_first(di, flags): initialize the device iterator `di'
       *     and return the first device_t in the device tree, or NULL
       *     if there are no devices.  The statement
       *
       *         dv = deviter_first(di, flags);
       *
       *     is shorthand for
       *
       *         deviter_init(di, flags);
       *         dv = deviter_next(di);
       *
       * deviter_next(di): return the next device_t in the device tree,
       *     or NULL if there are no more devices.  deviter_next(di)
       *     is undefined if `di' was not initialized with deviter_init() or
       *     deviter_first().
       *
       * deviter_release(di): stops iteration (subsequent calls to
       *     deviter_next() will return NULL), releases any locks and
       *     resources held by the device iterator.
       *
       * Device iteration does not return device_t's in any particular
       * order.  An iterator will never return the same device_t twice.
       * Device iteration is guaranteed to complete---i.e., if deviter_next(di)
       * is called repeatedly on the same `di', it will eventually return
       * NULL.  It is ok to attach/detach devices during device iteration.
       */
      void
      deviter_init(deviter_t *di, deviter_flags_t flags)
      {
              device_t dv;
      
              memset(di, 0, sizeof(*di));
      
              if ((flags & DEVITER_F_SHUTDOWN) != 0)
                      flags |= DEVITER_F_RW;
      
              mutex_enter(&alldevs_lock);
              if ((flags & DEVITER_F_RW) != 0)
                      alldevs_nwrite++;
              else
                      alldevs_nread++;
              di->di_gen = alldevs_gen++;
              di->di_flags = flags;
      
              switch (di->di_flags & (DEVITER_F_LEAVES_FIRST|DEVITER_F_ROOT_FIRST)) {
              case DEVITER_F_LEAVES_FIRST:
                      TAILQ_FOREACH(dv, &alldevs, dv_list) {
                              if (!deviter_visits(di, dv))
                                      continue;
                              di->di_curdepth = MAX(di->di_curdepth, dv->dv_depth);
                      }
                      break;
              case DEVITER_F_ROOT_FIRST:
                      TAILQ_FOREACH(dv, &alldevs, dv_list) {
                              if (!deviter_visits(di, dv))
                                      continue;
                              di->di_maxdepth = MAX(di->di_maxdepth, dv->dv_depth);
                      }
                      break;
              default:
                      break;
              }
      
              deviter_reinit(di);
              mutex_exit(&alldevs_lock);
      }
      
      static void
      deviter_reinit(deviter_t *di)
      {
      
              KASSERT(mutex_owned(&alldevs_lock));
              if ((di->di_flags & DEVITER_F_RW) != 0)
                      di->di_prev = TAILQ_LAST(&alldevs, devicelist);
              else
                      di->di_prev = TAILQ_FIRST(&alldevs);
      }
      
      device_t
      deviter_first(deviter_t *di, deviter_flags_t flags)
      {
      
              deviter_init(di, flags);
              return deviter_next(di);
      }
      
      static device_t
      deviter_next2(deviter_t *di)
      {
              device_t dv;
      
              KASSERT(mutex_owned(&alldevs_lock));
      
              dv = di->di_prev;
      
              if (dv == NULL)
                      return NULL;
      
              if ((di->di_flags & DEVITER_F_RW) != 0)
                      di->di_prev = TAILQ_PREV(dv, devicelist, dv_list);
              else
                      di->di_prev = TAILQ_NEXT(dv, dv_list);
      
              return dv;
      }
      
      static device_t
      deviter_next1(deviter_t *di)
      {
              device_t dv;
      
              KASSERT(mutex_owned(&alldevs_lock));
      
              do {
                      dv = deviter_next2(di);
              } while (dv != NULL && !deviter_visits(di, dv));
      
              return dv;
      }
      
      device_t
      deviter_next(deviter_t *di)
      {
              device_t dv = NULL;
      
              mutex_enter(&alldevs_lock);
              switch (di->di_flags & (DEVITER_F_LEAVES_FIRST|DEVITER_F_ROOT_FIRST)) {
              case 0:
                      dv = deviter_next1(di);
                      break;
              case DEVITER_F_LEAVES_FIRST:
                      while (di->di_curdepth >= 0) {
                              if ((dv = deviter_next1(di)) == NULL) {
                                      di->di_curdepth--;
                                      deviter_reinit(di);
                              } else if (dv->dv_depth == di->di_curdepth)
                                      break;
                      }
                      break;
              case DEVITER_F_ROOT_FIRST:
                      while (di->di_curdepth <= di->di_maxdepth) {
                              if ((dv = deviter_next1(di)) == NULL) {
                                      di->di_curdepth++;
                                      deviter_reinit(di);
                              } else if (dv->dv_depth == di->di_curdepth)
                                      break;
                      }
                      break;
              default:
                      break;
              }
              mutex_exit(&alldevs_lock);
      
              return dv;
      }
      
      void
      deviter_release(deviter_t *di)
      {
              bool rw = (di->di_flags & DEVITER_F_RW) != 0;
      
              mutex_enter(&alldevs_lock);
              if (rw)
                      --alldevs_nwrite;
              else
                      --alldevs_nread;
              /* XXX wake a garbage-collection thread */
              mutex_exit(&alldevs_lock);
      }
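
      /*
       * Illustrative sketch (not compiled; not part of the original source):
       * the plain read-only walk described in the big comment above
       * deviter_init().  device_find_by_xname() earlier in this file is a
       * real instance of the same pattern.  example_visit is hypothetical.
       */
      #if 0
      static void
      example_walk(void)
      {
              deviter_t di;
              device_t dv;

              for (dv = deviter_first(&di, 0); dv != NULL;
                   dv = deviter_next(&di))
                      example_visit(dv);
              deviter_release(&di);
      }
      #endif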
      
      const char *
      cfdata_ifattr(const struct cfdata *cf)
      {
              return cf->cf_pspec->cfp_iattr;
      }
      
      bool
      ifattr_match(const char *snull, const char *t)
      {
              return (snull == NULL) || strcmp(snull, t) == 0;
      }
      
      void
      null_childdetached(device_t self, device_t child)
      {
              /* do nothing */
      }
      
      static void
      sysctl_detach_setup(struct sysctllog **clog)
      {
      
              sysctl_createv(clog, 0, NULL, NULL,
                      CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
                      CTLTYPE_BOOL, "detachall",
                      SYSCTL_DESCR("Detach all devices at shutdown"),
                      NULL, 0, &detachall, 0,
                      CTL_KERN, CTL_CREATE, CTL_EOL);
      }
      /*        $NetBSD: uipc_domain.c,v 1.106 2018/12/27 07:56:43 maxv Exp $        */
      
      /*
       * Copyright (c) 1982, 1986, 1993
       *        The Regents of the University of California.  All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. Neither the name of the University nor the names of its contributors
       *    may be used to endorse or promote products derived from this software
       *    without specific prior written permission.
       *
       * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
       * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
       * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       * SUCH DAMAGE.
       *
       *        @(#)uipc_domain.c        8.3 (Berkeley) 2/14/95
       */
      
      #include <sys/cdefs.h>
      __KERNEL_RCSID(0, "$NetBSD: uipc_domain.c,v 1.106 2018/12/27 07:56:43 maxv Exp $");
      
      #include <sys/param.h>
      #include <sys/socket.h>
      #include <sys/socketvar.h>
      #include <sys/protosw.h>
      #include <sys/domain.h>
      #include <sys/mbuf.h>
      #include <sys/time.h>
      #include <sys/kernel.h>
      #include <sys/systm.h>
      #include <sys/callout.h>
      #include <sys/queue.h>
      #include <sys/proc.h>
      #include <sys/sysctl.h>
      #include <sys/un.h>
      #include <sys/unpcb.h>
      #include <sys/file.h>
      #include <sys/filedesc.h>
      #include <sys/kauth.h>
      
      #include <netatalk/at.h>
      #include <net/if_dl.h>
      #include <netinet/in.h>
      
      MALLOC_DECLARE(M_SOCKADDR);
      
      MALLOC_DEFINE(M_SOCKADDR, "sockaddr", "socket endpoints");
      
      void        pffasttimo(void *);
      void        pfslowtimo(void *);
      
      struct domainhead domains = STAILQ_HEAD_INITIALIZER(domains);
      static struct domain *domain_array[AF_MAX];
      
      callout_t pffasttimo_ch, pfslowtimo_ch;
      
      /*
       * Current time values for fast and slow timeouts.  We can use u_int
       * relatively safely.  The fast timer will roll over in 27 years and
       * the slow timer in 68 years.
       */
      u_int        pfslowtimo_now;
      u_int        pffasttimo_now;
      
      static struct sysctllog *domain_sysctllog;
      static void sysctl_net_setup(void);
      
      /* ensure successful linkage even without any domains in link sets */
      static struct domain domain_dummy;
      __link_set_add_rodata(domains,domain_dummy);
      
      static void
      domain_init_timers(void)
      {
      
              callout_init(&pffasttimo_ch, CALLOUT_MPSAFE);
              callout_init(&pfslowtimo_ch, CALLOUT_MPSAFE);
      
              callout_reset(&pffasttimo_ch, 1, pffasttimo, NULL);
              callout_reset(&pfslowtimo_ch, 1, pfslowtimo, NULL);
      }
      
      void
      domaininit(bool attach)
      {
              __link_set_decl(domains, struct domain);
              struct domain * const * dpp;
              struct domain *rt_domain = NULL;
      
              sysctl_net_setup();
      
              /*
               * Add all of the domains.  Make sure the PF_ROUTE
               * domain is added last.
               */
              if (attach) {
                      __link_set_foreach(dpp, domains) {
                              if (*dpp == &domain_dummy)
                                      continue;
                              if ((*dpp)->dom_family == PF_ROUTE)
                                      rt_domain = *dpp;
                              else
                                      domain_attach(*dpp);
                      }
                      if (rt_domain)
                              domain_attach(rt_domain);
      
                      domain_init_timers();
              }
      }
      
      /*
       * Must be called only if domaininit has been called with false and
       * after all domains have been attached.
       */
      void
      domaininit_post(void)
      {
      
              domain_init_timers();
      }
      
      void
      domain_attach(struct domain *dp)
      {
              const struct protosw *pr;
      
              STAILQ_INSERT_TAIL(&domains, dp, dom_link);
              if (dp->dom_family < __arraycount(domain_array))
                      domain_array[dp->dom_family] = dp;
      
              if (dp->dom_init)
                      (*dp->dom_init)();
      
      #ifdef MBUFTRACE
              if (dp->dom_mowner.mo_name[0] == '\0') {
                      strncpy(dp->dom_mowner.mo_name, dp->dom_name,
                          sizeof(dp->dom_mowner.mo_name));
                      MOWNER_ATTACH(&dp->dom_mowner);
              }
      #endif
              for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
                      if (pr->pr_init)
                              (*pr->pr_init)();
              }
      
              if (max_linkhdr < 16)                /* XXX */
                      max_linkhdr = 16;
              max_hdr = max_linkhdr + max_protohdr;
              max_datalen = MHLEN - max_hdr;
      }
      
      struct domain *
      pffinddomain(int family)
      {
              struct domain *dp;
      
              if (family < __arraycount(domain_array) && domain_array[family] != NULL)
                      return domain_array[family];

              DOMAIN_FOREACH(dp)
                      if (dp->dom_family == family)
                              return dp;
              return NULL;
      }
      
      const struct protosw *
      pffindtype(int family, int type)
      {
              struct domain *dp;
              const struct protosw *pr;
      
              dp = pffinddomain(family);
              if (dp == NULL)
                      return NULL;

              for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
                      if (pr->pr_type && pr->pr_type == type)
                              return pr;
      
              return NULL;
      }
      
      const struct protosw *
      pffindproto(int family, int protocol, int type)
      {
              struct domain *dp;
              const struct protosw *pr;
              const struct protosw *maybe = NULL;
      
              if (family == 0)
                      return NULL;

              dp = pffinddomain(family);
              if (dp == NULL)
                      return NULL;

              for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
                      if ((pr->pr_protocol == protocol) && (pr->pr_type == type))
                              return pr;

                      if (type == SOCK_RAW && pr->pr_type == SOCK_RAW &&
                          pr->pr_protocol == 0 && maybe == NULL)
                              maybe = pr;
              }
              return maybe;
      }
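
      /*
       * Illustrative sketch (not compiled; not part of the original source):
       * socket creation resolves a (family, type[, protocol]) tuple to a
       * protosw through pffindtype()/pffindproto(); the concrete values
       * below are only an example, and example_lookup_udp is hypothetical.
       */
      #if 0
      static const struct protosw *
      example_lookup_udp(void)
      {
              /* pffindtype(PF_INET, SOCK_STREAM) would similarly find TCP. */
              return pffindproto(PF_INET, IPPROTO_UDP, SOCK_DGRAM);
      }
      #endif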
      
      void *
      sockaddr_addr(struct sockaddr *sa, socklen_t *slenp)
      {
              const struct domain *dom;
      
              if ((dom = pffinddomain(sa->sa_family)) == NULL ||
                  dom->dom_sockaddr_addr == NULL)
                      return NULL;
      
              return (*dom->dom_sockaddr_addr)(sa, slenp);
      }
      
      const void *
      sockaddr_const_addr(const struct sockaddr *sa, socklen_t *slenp)
      {
              const struct domain *dom;
      
              if ((dom = pffinddomain(sa->sa_family)) == NULL ||
                  dom->dom_sockaddr_const_addr == NULL)
                      return NULL;
      
              return (*dom->dom_sockaddr_const_addr)(sa, slenp);
      }
      
      const struct sockaddr *
      sockaddr_any_by_family(sa_family_t family)
      {
              const struct domain *dom;
      
              if ((dom = pffinddomain(family)) == NULL)
                      return NULL;
      
              return dom->dom_sa_any;
      }
      
      const struct sockaddr *
      sockaddr_any(const struct sockaddr *sa)
      {
              return sockaddr_any_by_family(sa->sa_family);
      }
      
      const void *
      sockaddr_anyaddr(const struct sockaddr *sa, socklen_t *slenp)
      {
              const struct sockaddr *any;
      
              if ((any = sockaddr_any(sa)) == NULL)
                      return NULL;
      
              return sockaddr_const_addr(any, slenp);
      }
      
      socklen_t
      sockaddr_getsize_by_family(sa_family_t af)
      {
              switch (af) {
              case AF_INET:
                      return sizeof(struct sockaddr_in);
              case AF_INET6:
                      return sizeof(struct sockaddr_in6);
              case AF_UNIX:
                      return sizeof(struct sockaddr_un);
              case AF_LINK:
                      return sizeof(struct sockaddr_dl);
              case AF_APPLETALK:
                      return sizeof(struct sockaddr_at);
              default:
      #ifdef DIAGNOSTIC
                      printf("%s: (%s:%u:%u) Unhandled address family=%hhu\n",
                          __func__, curlwp->l_proc->p_comm,
                          curlwp->l_proc->p_pid, curlwp->l_lid, af);
      #endif
                      return 0;
              }
      }
      
      #ifdef DIAGNOSTIC
      static void
      sockaddr_checklen(const struct sockaddr *sa)
      {
              // Can't tell how much was allocated, if it was allocated.
              if (sa->sa_family == AF_LINK)
                      return;

              socklen_t len = sockaddr_getsize_by_family(sa->sa_family);
              if (len == 0 || len == sa->sa_len)
                      return;
      
              char buf[512];
              sockaddr_format(sa, buf, sizeof(buf));
              printf("%s: %p bad len af=%hhu socklen=%hhu len=%u [%s]\n",
                  __func__, sa, sa->sa_family, sa->sa_len, (unsigned)len, buf);
      }
      #else
      #define sockaddr_checklen(sa) ((void)0)
      #endif
      
      struct sockaddr *
      sockaddr_alloc(sa_family_t af, socklen_t socklen, int flags)
      {
              struct sockaddr *sa;
              socklen_t reallen = MAX(socklen, offsetof(struct sockaddr, sa_data[0]));

              if ((sa = malloc(reallen, M_SOCKADDR, flags)) == NULL)
                      return NULL;

              sa->sa_family = af;
              sa->sa_len = reallen;
              sockaddr_checklen(sa);
              return sa;
      }
      
      struct sockaddr *
      sockaddr_copy(struct sockaddr *dst, socklen_t socklen,
          const struct sockaddr *src)
      {
              if (__predict_false(socklen < src->sa_len)) {
                      panic("%s: source too long, %d < %d bytes", __func__, socklen,
                          src->sa_len);
              }
              sockaddr_checklen(src);
              return memcpy(dst, src, src->sa_len);
      }
      
      struct sockaddr *
      sockaddr_externalize(struct sockaddr *dst, socklen_t socklen,
          const struct sockaddr *src)
      {
              struct domain *dom;
      
              dom = pffinddomain(src->sa_family);
      
              if (dom != NULL && dom->dom_sockaddr_externalize != NULL)
                      return (*dom->dom_sockaddr_externalize)(dst, socklen, src);
      
              return sockaddr_copy(dst, socklen, src);
      }
      
      int
      sockaddr_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2)
      {
              int len, rc;
              struct domain *dom;
      
              if (sa1->sa_family != sa2->sa_family)
                      return sa1->sa_family - sa2->sa_family;

              dom = pffinddomain(sa1->sa_family);

              if (dom != NULL && dom->dom_sockaddr_cmp != NULL)
                      return (*dom->dom_sockaddr_cmp)(sa1, sa2);

              len = MIN(sa1->sa_len, sa2->sa_len);

              if (dom == NULL || dom->dom_sa_cmplen == 0) {
                      if ((rc = memcmp(sa1, sa2, len)) != 0)
                              return rc;
                      return sa1->sa_len - sa2->sa_len;
              }

              if ((rc = memcmp((const char *)sa1 + dom->dom_sa_cmpofs,
                               (const char *)sa2 + dom->dom_sa_cmpofs,
                               MIN(dom->dom_sa_cmplen,
                                   len - MIN(len, dom->dom_sa_cmpofs)))) != 0)
                      return rc;

              return MIN(dom->dom_sa_cmplen + dom->dom_sa_cmpofs, sa1->sa_len) -
                  MIN(dom->dom_sa_cmplen + dom->dom_sa_cmpofs, sa2->sa_len);
      }
      
      struct sockaddr *
      sockaddr_dup(const struct sockaddr *src, int flags)
      {
              struct sockaddr *dst;
      
              if ((dst = sockaddr_alloc(src->sa_family, src->sa_len, flags)) == NULL)
                      return NULL;

              return sockaddr_copy(dst, dst->sa_len, src);
      }
      
      void
      sockaddr_free(struct sockaddr *sa)
      {
              free(sa, M_SOCKADDR);
      }
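
      /*
       * Illustrative sketch (not compiled; not part of the original source):
       * the usual lifecycle for keeping a caller-supplied address is
       * sockaddr_dup() to take a private copy and sockaddr_free() when it
       * is no longer needed.  example_remember_peer is hypothetical.
       */
      #if 0
      static struct sockaddr *
      example_remember_peer(const struct sockaddr *sa)
      {
              /* M_NOWAIT: fail with NULL instead of sleeping for memory. */
              struct sockaddr *copy = sockaddr_dup(sa, M_NOWAIT);

              /* ... stash copy somewhere; later: sockaddr_free(copy); ... */
              return copy;
      }
      #endif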
      
      static int
      sun_print(char *buf, size_t len, const void *v)
      {
              const struct sockaddr_un *sun = v;
              return snprintf(buf, len, "%s", sun->sun_path);
      }
      
      int
      sockaddr_format(const struct sockaddr *sa, char *buf, size_t len)
      {
              size_t plen = 0;
      
              if (sa == NULL)
                      return strlcpy(buf, "(null)", len);
      
              switch (sa->sa_family) {
              case AF_LOCAL:
                      plen = strlcpy(buf, "unix: ", len);
                      break;
              case AF_INET:
                      plen = strlcpy(buf, "inet: ", len);
                      break;
              case AF_INET6:
                      plen = strlcpy(buf, "inet6: ", len);
                      break;
              case AF_LINK:
                      plen = strlcpy(buf, "link: ", len);
                      break;
              case AF_APPLETALK:
                      plen = strlcpy(buf, "atalk: ", len);
                      break;
              default:
                      return snprintf(buf, len, "(unknown socket family %d)",
                          (int)sa->sa_family);
              }
      
              buf += plen;
              if (plen > len)
                      len = 0;
              else
                      len -= plen;
      
              switch (sa->sa_family) {
              case AF_LOCAL:
                      return sun_print(buf, len, sa);
              case AF_INET:
                      return sin_print(buf, len, sa);
              case AF_INET6:
                      return sin6_print(buf, len, sa);
              case AF_LINK:
                      return sdl_print(buf, len, sa);
              case AF_APPLETALK:
                      return sat_print(buf, len, sa);
              default:
                      panic("bad family %hhu", sa->sa_family);
              }
      }
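
/*
 * Added usage sketch (not part of the original source): formatting an
 * address for a diagnostic message.  "sa" is assumed to be a valid
 * sockaddr of one of the families handled above.
 */
#if 0	/* illustrative only, not compiled */
	char abuf[128];

	sockaddr_format(sa, abuf, sizeof(abuf));
	printf("connection from %s\n", abuf);
#endif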
      
      /*
       * sysctl helper to stuff PF_LOCAL pcbs into sysctl structures
       */
      static void
      sysctl_dounpcb(struct kinfo_pcb *pcb, const struct socket *so)
      {
              const bool allowaddr = get_expose_address(curproc);
              struct unpcb *unp = sotounpcb(so);
              struct sockaddr_un *un = unp->unp_addr;
      
              memset(pcb, 0, sizeof(*pcb));
      
              pcb->ki_family = so->so_proto->pr_domain->dom_family;
              pcb->ki_type = so->so_proto->pr_type;
              pcb->ki_protocol = so->so_proto->pr_protocol;
              pcb->ki_pflags = unp->unp_flags;
      
              COND_SET_VALUE(pcb->ki_pcbaddr, PTRTOUINT64(unp), allowaddr);
              /* pcb->ki_ppcbaddr = unp has no ppcb... */
              COND_SET_VALUE(pcb->ki_sockaddr, PTRTOUINT64(so), allowaddr);
      
              pcb->ki_sostate = so->so_state;
              /* pcb->ki_prstate = unp has no state... */
      
              pcb->ki_rcvq = so->so_rcv.sb_cc;
              pcb->ki_sndq = so->so_snd.sb_cc;
      
              un = (struct sockaddr_un *)pcb->ki_spad;
	/*
	 * Local domain sockets need not be bound to a local
	 * endpoint, so unp_addr may be NULL.
	 */
              if (unp->unp_addr != NULL) {
		/*
		 * makeun() allocates one byte beyond sun_len to hold a
		 * terminating NUL, which we want to copy here as well.
		 */
                      memcpy(un, unp->unp_addr,
                          uimin(sizeof(pcb->ki_spad), unp->unp_addr->sun_len + 1));
              }
              else {
                      un->sun_len = offsetof(struct sockaddr_un, sun_path);
                      un->sun_family = pcb->ki_family;
              }
              if (unp->unp_conn != NULL) {
                      un = (struct sockaddr_un *)pcb->ki_dpad;
                      if (unp->unp_conn->unp_addr != NULL) {
                              memcpy(un, unp->unp_conn->unp_addr,
                                  uimin(sizeof(pcb->ki_dpad), unp->unp_conn->unp_addr->sun_len + 1));
                      }
                      else {
                              un->sun_len = offsetof(struct sockaddr_un, sun_path);
                              un->sun_family = pcb->ki_family;
                      }
              }
      
              pcb->ki_inode = unp->unp_ino;
              COND_SET_VALUE(pcb->ki_vnode, PTRTOUINT64(unp->unp_vnode), allowaddr);
              COND_SET_VALUE(pcb->ki_conn, PTRTOUINT64(unp->unp_conn), allowaddr);
              COND_SET_VALUE(pcb->ki_refs, PTRTOUINT64(unp->unp_refs), allowaddr);
              COND_SET_VALUE(pcb->ki_nextref, PTRTOUINT64(unp->unp_nextref),
                  allowaddr);
      }
      
      static int
      sysctl_unpcblist(SYSCTLFN_ARGS)
      {
              struct file *fp, *np, *dfp;
              struct socket *so;
              struct kinfo_pcb pcb;
              char *dp;
              size_t len, needed, elem_size, out_size;
              int error, elem_count, pf, type;
      
              if (namelen == 1 && name[0] == CTL_QUERY)
                      return sysctl_query(SYSCTLFN_CALL(rnode));
      
              if (namelen != 4)
                      return EINVAL;
      
              if (oldp != NULL) {
                      len = *oldlenp;
                      elem_size = name[2];
                      elem_count = name[3];
                      if (elem_size != sizeof(pcb))
                              return EINVAL;
              } else {
                      len = 0;
                      elem_size = sizeof(pcb);
                      elem_count = INT_MAX;
              }
              error = 0;
              dp = oldp;
              out_size = elem_size;
              needed = 0;
      
              if (name - oname != 4)
                      return EINVAL;
      
              pf = oname[1];
              type = oname[2];
      
	/*
	 * Allocate a dummy file to mark our position in the file list
	 * while the lock is dropped to copy data out.
	 */
              sysctl_unlock();
              if ((dfp = fgetdummy()) == NULL) {
		sysctl_relock();
                      return ENOMEM;
              }
      
              /*
               * there's no "list" of local domain sockets, so we have
               * to walk the file list looking for them.  :-/
               */
              mutex_enter(&filelist_lock);
              LIST_FOREACH_SAFE(fp, &filehead, f_list, np) {
                      if (fp->f_count == 0 || fp->f_type != DTYPE_SOCKET ||
                          fp->f_socket == NULL)
                              continue;
                      so = fp->f_socket;
                      if (so->so_type != type)
                              continue;
                      if (so->so_proto->pr_domain->dom_family != pf)
                              continue;
                      if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
                          KAUTH_REQ_NETWORK_SOCKET_CANSEE, so, NULL, NULL) != 0)
                              continue;
                      if (len >= elem_size && elem_count > 0) {
                              mutex_enter(&fp->f_lock);
			/*
			 * Do not add a reference if the count has reached 0.
			 * The check above was performed without the lock
			 * held, so recheck here: a concurrent closef() could
			 * have dropped the count in the meantime.
			 */
                              if (fp->f_count == 0) {
                                      mutex_exit(&fp->f_lock);
                                      continue;
                              }
                              fp->f_count++;
                              mutex_exit(&fp->f_lock);
                              LIST_INSERT_AFTER(fp, dfp, f_list);
                              mutex_exit(&filelist_lock);
                              sysctl_dounpcb(&pcb, so);
                              error = copyout(&pcb, dp, out_size);
                              closef(fp);
                              mutex_enter(&filelist_lock);
                              np = LIST_NEXT(dfp, f_list);
                              LIST_REMOVE(dfp, f_list);
                              if (error)
                                      break;
                              dp += elem_size;
                              len -= elem_size;
                      }
                      needed += elem_size;
                      if (elem_count > 0 && elem_count != INT_MAX)
                              elem_count--;
              }
              mutex_exit(&filelist_lock);
              fputdummy(dfp);
	*oldlenp = needed;
	if (oldp == NULL)
		*oldlenp += PCB_SLOP * sizeof(struct kinfo_pcb);
	sysctl_relock();
      
              return error;
      }
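
/*
 * Added usage sketch (not part of the original source): how a userland
 * consumer such as netstat(1) might size and fetch this list.  The node
 * is created with CTL_CREATE below, so its number is dynamic and must be
 * resolved first (e.g. with sysctlgetmibinfo(3)); the trailing four name
 * components match the checks in sysctl_unpcblist() above (two words this
 * handler does not use, then element size and element count).
 */
#if 0	/* illustrative userland code, not compiled here */
	int mib[8];
	size_t sz;
	void *buf;

	mib[0] = CTL_NET;
	mib[1] = PF_LOCAL;
	mib[2] = SOCK_STREAM;
	/* mib[3] = dynamically resolved "pcblist" node number */
	mib[4] = mib[5] = 0;			/* not used by this handler */
	mib[6] = sizeof(struct kinfo_pcb);	/* must equal elem_size */
	mib[7] = INT_MAX;			/* no limit on elem_count */

	if (sysctl(mib, 8, NULL, &sz, NULL, 0) == -1)	/* probe for size */
		err(1, "sysctl");
	if ((buf = malloc(sz)) == NULL)
		err(1, "malloc");
	if (sysctl(mib, 8, buf, &sz, NULL, 0) == -1)
		err(1, "sysctl");
	/* sz / sizeof(struct kinfo_pcb) entries are now in buf. */
#endif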
      
      static void
      sysctl_net_setup(void)
      {
      
              KASSERT(domain_sysctllog == NULL);
              sysctl_createv(&domain_sysctllog, 0, NULL, NULL,
                             CTLFLAG_PERMANENT,
                             CTLTYPE_NODE, "local",
                             SYSCTL_DESCR("PF_LOCAL related settings"),
                             NULL, 0, NULL, 0,
                             CTL_NET, PF_LOCAL, CTL_EOL);
              sysctl_createv(&domain_sysctllog, 0, NULL, NULL,
                             CTLFLAG_PERMANENT,
                             CTLTYPE_NODE, "stream",
                             SYSCTL_DESCR("SOCK_STREAM settings"),
                             NULL, 0, NULL, 0,
                             CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_EOL);
              sysctl_createv(&domain_sysctllog, 0, NULL, NULL,
                             CTLFLAG_PERMANENT,
                             CTLTYPE_NODE, "seqpacket",
                             SYSCTL_DESCR("SOCK_SEQPACKET settings"),
                             NULL, 0, NULL, 0,
                             CTL_NET, PF_LOCAL, SOCK_SEQPACKET, CTL_EOL);
              sysctl_createv(&domain_sysctllog, 0, NULL, NULL,
                             CTLFLAG_PERMANENT,
                             CTLTYPE_NODE, "dgram",
                             SYSCTL_DESCR("SOCK_DGRAM settings"),
                             NULL, 0, NULL, 0,
                             CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_EOL);
      
              sysctl_createv(&domain_sysctllog, 0, NULL, NULL,
                             CTLFLAG_PERMANENT,
                             CTLTYPE_STRUCT, "pcblist",
                             SYSCTL_DESCR("SOCK_STREAM protocol control block list"),
                             sysctl_unpcblist, 0, NULL, 0,
                             CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL);
              sysctl_createv(&domain_sysctllog, 0, NULL, NULL,
                             CTLFLAG_PERMANENT,
                             CTLTYPE_STRUCT, "pcblist",
                             SYSCTL_DESCR("SOCK_SEQPACKET protocol control "
                                          "block list"),
                             sysctl_unpcblist, 0, NULL, 0,
                             CTL_NET, PF_LOCAL, SOCK_SEQPACKET, CTL_CREATE, CTL_EOL);
              sysctl_createv(&domain_sysctllog, 0, NULL, NULL,
                             CTLFLAG_PERMANENT,
                             CTLTYPE_STRUCT, "pcblist",
                             SYSCTL_DESCR("SOCK_DGRAM protocol control block list"),
                             sysctl_unpcblist, 0, NULL, 0,
                             CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL);
              unp_sysctl_create(&domain_sysctllog);
      }
      
      void
      pfctlinput(int cmd, const struct sockaddr *sa)
      {
              struct domain *dp;
              const struct protosw *pr;
      
              DOMAIN_FOREACH(dp) {
                      for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
                              if (pr->pr_ctlinput != NULL)
                                      (*pr->pr_ctlinput)(cmd, sa, NULL);
                      }
              }
      }
      
      void
      pfctlinput2(int cmd, const struct sockaddr *sa, void *ctlparam)
      {
              struct domain *dp;
              const struct protosw *pr;
      
              if (sa == NULL)
                      return;
      
              DOMAIN_FOREACH(dp) {
		/*
		 * Each xx_ctlinput() must make this check anyway, to ensure
		 * the data item pointed to by ctlparam is interpreted
		 * correctly; the check here is only an extra safety measure.
		 */
                      if (dp->dom_family != sa->sa_family)
                              continue;
      
                      for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
                              if (pr->pr_ctlinput != NULL)
                                      (*pr->pr_ctlinput)(cmd, sa, ctlparam);
                      }
              }
      }
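
/*
 * Added usage sketch (not part of the original source): a typical caller
 * of pfctlinput() is interface-down handling, which broadcasts the event
 * to every protocol that registered a pr_ctlinput handler.  "sa" is
 * assumed to be the address of the affected interface.
 */
#if 0	/* illustrative only, not compiled */
	pfctlinput(PRC_IFDOWN, sa);
#endif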
      
      void
      pfslowtimo(void *arg)
      {
              struct domain *dp;
              const struct protosw *pr;
      
              pfslowtimo_now++;
      
              DOMAIN_FOREACH(dp) {
                      for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
                              if (pr->pr_slowtimo)
                                      (*pr->pr_slowtimo)();
              }
              callout_schedule(&pfslowtimo_ch, hz / PR_SLOWHZ);
      }
      
      void
      pffasttimo(void *arg)
      {
              struct domain *dp;
              const struct protosw *pr;
      
              pffasttimo_now++;
      
              DOMAIN_FOREACH(dp) {
                      for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
                              if (pr->pr_fasttimo)
                                      (*pr->pr_fasttimo)();
              }
              callout_schedule(&pffasttimo_ch, hz / PR_FASTHZ);
      }
      /*        $NetBSD: pmap.h,v 1.105 2019/11/14 16:23:52 maxv Exp $        */
      
      /*
       * Copyright (c) 1997 Charles D. Cranor and Washington University.
       * All rights reserved.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       *
       * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
       * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
       * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
       * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
       * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
       * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
       * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       */
      
      /*
       * Copyright (c) 2001 Wasabi Systems, Inc.
       * All rights reserved.
       *
       * Written by Frank van der Linden for Wasabi Systems, Inc.
       *
       * Redistribution and use in source and binary forms, with or without
       * modification, are permitted provided that the following conditions
       * are met:
       * 1. Redistributions of source code must retain the above copyright
       *    notice, this list of conditions and the following disclaimer.
       * 2. Redistributions in binary form must reproduce the above copyright
       *    notice, this list of conditions and the following disclaimer in the
       *    documentation and/or other materials provided with the distribution.
       * 3. All advertising materials mentioning features or use of this software
       *    must display the following acknowledgement:
       *      This product includes software developed for the NetBSD Project by
       *      Wasabi Systems, Inc.
       * 4. The name of Wasabi Systems, Inc. may not be used to endorse
       *    or promote products derived from this software without specific prior