/*	$OpenBSD: kern_sysctl.c,v 1.389 2021/02/08 10:51:02 mpi Exp $	*/
/*	$NetBSD: kern_sysctl.c,v 1.17 1996/05/20 17:49:05 mrg Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Mike Karels at Berkeley Software Design, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_sysctl.c	8.4 (Berkeley) 4/14/94
 */

/*
 * sysctl system call.
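 *
 * A minimal userland sketch of a read through this path (illustrative
 * only; error handling trimmed):
 *
 *	int mib[2] = { CTL_KERN, KERN_OSTYPE };
 *	char buf[64];
 *	size_t len = sizeof(buf);
 *	sysctl(mib, 2, buf, &len, NULL, 0);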
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/vnode.h>
#include <sys/unistd.h>
#include <sys/buf.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/disklabel.h>
#include <sys/disk.h>
#include <sys/sysctl.h>
#include <sys/msgbuf.h>
#include <sys/vmmeter.h>
#include <sys/namei.h>
#include <sys/exec.h>
#include <sys/mbuf.h>
#include <sys/percpu.h>
#include <sys/sensors.h>
#include <sys/pipe.h>
#include <sys/eventvar.h>
#include <sys/socketvar.h>
#include <sys/socket.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/pledge.h>
#include <sys/timetc.h>
#include <sys/evcount.h>
#include <sys/un.h>
#include <sys/unpcb.h>
#include <sys/sched.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/wait.h>
#include <sys/witness.h>

#include <uvm/uvm_extern.h>

#include <dev/cons.h>

#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet6/ip6_var.h>

#ifdef DDB
#include <ddb/db_var.h>
#endif

#ifdef SYSVMSG
#include <sys/msg.h>
#endif
#ifdef SYSVSEM
#include <sys/sem.h>
#endif
#ifdef SYSVSHM
#include <sys/shm.h>
#endif

#include "audio.h"
#include "video.h"
#include "pf.h"

extern struct forkstat forkstat;
extern struct nchstats nchstats;
extern int nselcoll, fscale;
extern struct disklist_head disklist;
extern fixpt_t ccpu;
extern long numvnodes;
#if NAUDIO > 0
extern int audio_record_enable;
#endif
#if NVIDEO > 0
extern int video_record_enable;
#endif

int allowkmem;
int allowdt;

int sysctl_diskinit(int, struct proc *);
int sysctl_proc_args(int *, u_int, void *, size_t *, struct proc *);
int sysctl_proc_cwd(int *, u_int, void *, size_t *, struct proc *);
int sysctl_proc_nobroadcastkill(int *, u_int, void *, size_t, void *,
	size_t *, struct proc *);
int sysctl_proc_vmmap(int *, u_int, void *, size_t *, struct proc *);
int sysctl_intrcnt(int *, u_int, void *, size_t *);
int sysctl_sensors(int *, u_int, void *, size_t *, void *, size_t);
int sysctl_cptime2(int *, u_int, void *, size_t *, void *, size_t);
#if NAUDIO > 0
int sysctl_audio(int *, u_int, void *, size_t *, void *, size_t);
#endif
#if NVIDEO > 0
int sysctl_video(int *, u_int, void *, size_t *, void *, size_t);
#endif
int sysctl_cpustats(int *, u_int, void *, size_t *, void *, size_t);
int sysctl_utc_offset(void *, size_t *, void *, size_t);

void fill_file(struct kinfo_file *, struct file *, struct filedesc *, int,
    struct vnode *, struct process *, struct proc *, struct socket *, int);
void fill_kproc(struct process *, struct kinfo_proc *, struct proc *, int);

int (*cpu_cpuspeed)(int *);

/*
 * Lock to avoid too many processes vslocking a large amount of memory
 * at the same time.
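 *
 * (sys_sysctl() wires the caller's "old" buffer with uvm_vslock() before
 *  running a subtree handler, so the handler's copyout() cannot fault
 *  mid-operation; this rwlock bounds how much memory concurrent callers
 *  can have wired at once.)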
 */
struct rwlock sysctl_lock = RWLOCK_INITIALIZER("sysctllk");
struct rwlock sysctl_disklock = RWLOCK_INITIALIZER("sysctldlk");

int
sys_sysctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_sysctl_args /* {
		syscallarg(const int *) name;
		syscallarg(u_int) namelen;
		syscallarg(void *) old;
		syscallarg(size_t *) oldlenp;
		syscallarg(void *) new;
		syscallarg(size_t) newlen;
	} */ *uap = v;
	int error, dolock = 1;
	size_t savelen = 0, oldlen = 0;
	sysctlfn *fn;
	int name[CTL_MAXNAME];

	if (SCARG(uap, new) != NULL && (error = suser(p)))
		return (error);
	/*
	 * all top-level sysctl names are non-terminal
	 */
	if (SCARG(uap, namelen) > CTL_MAXNAME || SCARG(uap, namelen) < 2)
		return (EINVAL);
	error = copyin(SCARG(uap, name), name,
	    SCARG(uap, namelen) * sizeof(int));
	if (error)
		return (error);

	error = pledge_sysctl(p, SCARG(uap, namelen), name, SCARG(uap, new));
	if (error)
		return (error);

	switch (name[0]) {
	case CTL_KERN:
		fn = kern_sysctl;
		break;
	case CTL_HW:
		fn = hw_sysctl;
		break;
	case CTL_VM:
		fn = uvm_sysctl;
		break;
	case CTL_NET:
		fn = net_sysctl;
		break;
	case CTL_FS:
		fn = fs_sysctl;
		break;
	case CTL_VFS:
		fn = vfs_sysctl;
		break;
	case CTL_MACHDEP:
		fn = cpu_sysctl;
		break;
#ifdef DEBUG_SYSCTL
	case CTL_DEBUG:
		fn = debug_sysctl;
		break;
#endif
#ifdef DDB
	case CTL_DDB:
		fn = ddb_sysctl;
		break;
#endif
	default:
		return (EOPNOTSUPP);
	}

	if (SCARG(uap, oldlenp) &&
	    (error = copyin(SCARG(uap, oldlenp), &oldlen, sizeof(oldlen))))
		return (error);
	if (SCARG(uap, old) != NULL) {
		if ((error = rw_enter(&sysctl_lock, RW_WRITE|RW_INTR)) != 0)
			return (error);
		if (dolock) {
			if (atop(oldlen) > uvmexp.wiredmax - uvmexp.wired) {
				rw_exit_write(&sysctl_lock);
				return (ENOMEM);
			}
			error = uvm_vslock(p, SCARG(uap, old), oldlen,
			    PROT_READ | PROT_WRITE);
			if (error) {
				rw_exit_write(&sysctl_lock);
				return (error);
			}
		}
		savelen = oldlen;
	}
	error = (*fn)(&name[1], SCARG(uap, namelen) - 1, SCARG(uap, old),
	    &oldlen, SCARG(uap, new), SCARG(uap, newlen), p);
	if (SCARG(uap, old) != NULL) {
		if (dolock)
			uvm_vsunlock(p, SCARG(uap, old), savelen);
		rw_exit_write(&sysctl_lock);
	}
	if (error)
		return (error);
	if (SCARG(uap, oldlenp))
		error = copyout(&oldlen, SCARG(uap, oldlenp), sizeof(oldlen));
	return (error);
}

/*
 * Attributes stored in the kernel.
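 * (hostname, domainname, hostid, securelevel, and the cached disk
 *  name/statistics snapshots rebuilt by sysctl_diskinit() below.)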
*/ char hostname[MAXHOSTNAMELEN]; int hostnamelen; char domainname[MAXHOSTNAMELEN]; int domainnamelen; long hostid; char *disknames = NULL; size_t disknameslen; struct diskstats *diskstats = NULL; size_t diskstatslen; int securelevel; /* morally const values reported by sysctl_bounded_arr */ static int arg_max = ARG_MAX; static int openbsd = OpenBSD; static int posix_version = _POSIX_VERSION; static int ngroups_max = NGROUPS_MAX; static int int_zero = 0; static int int_one = 1; static int maxpartitions = MAXPARTITIONS; static int raw_part = RAW_PART; extern int somaxconn, sominconn; extern int nosuidcoredump; extern int maxlocksperuid; extern int uvm_wxabort; extern int global_ptrace; const struct sysctl_bounded_args kern_vars[] = { {KERN_OSREV, &openbsd, 1, 0}, {KERN_MAXVNODES, &maxvnodes, 0, INT_MAX}, {KERN_MAXPROC, &maxprocess, 0, INT_MAX}, {KERN_MAXFILES, &maxfiles, 0, INT_MAX}, {KERN_NFILES, &numfiles, 1, 0}, {KERN_TTYCOUNT, &tty_count, 1, 0}, {KERN_ARGMAX, &arg_max, 1, 0}, {KERN_NSELCOLL, &nselcoll, 1, 0}, {KERN_POSIX1, &posix_version, 1, 0}, {KERN_NGROUPS, &ngroups_max, 1, 0}, {KERN_JOB_CONTROL, &int_one, 1, 0}, {KERN_SAVED_IDS, &int_one, 1, 0}, {KERN_MAXPARTITIONS, &maxpartitions, 1, 0}, {KERN_RAWPARTITION, &raw_part, 1, 0}, {KERN_MAXTHREAD, &maxthread, 0, INT_MAX}, {KERN_NTHREADS, &nthreads, 1, 0}, {KERN_SOMAXCONN, &somaxconn, 0, SHRT_MAX}, {KERN_SOMINCONN, &sominconn, 0, SHRT_MAX}, {KERN_NOSUIDCOREDUMP, &nosuidcoredump, 0, 3}, {KERN_FSYNC, &int_one, 1, 0}, {KERN_SYSVMSG, #ifdef SYSVMSG &int_one, #else &int_zero, #endif 1, 0}, {KERN_SYSVSEM, #ifdef SYSVSEM &int_one, #else &int_zero, #endif 1, 0}, {KERN_SYSVSHM, #ifdef SYSVSHM &int_one, #else &int_zero, #endif 1, 0}, {KERN_FSCALE, &fscale, 1, 0}, {KERN_CCPU, &ccpu, 1, 0}, {KERN_NPROCS, &nprocesses, 1, 0}, {KERN_SPLASSERT, &splassert_ctl, 0, 3}, {KERN_MAXLOCKSPERUID, &maxlocksperuid, 0, INT_MAX}, {KERN_WXABORT, &uvm_wxabort, 0, 1}, {KERN_NETLIVELOCKS, &int_zero, 1, 0}, #ifdef PTRACE {KERN_GLOBAL_PTRACE, &global_ptrace, 0, 1}, #endif }; int kern_sysctl_dirs(int top_name, int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen, struct proc *p) { switch (top_name) { #ifndef SMALL_KERNEL case KERN_PROC: return (sysctl_doproc(name, namelen, oldp, oldlenp)); case KERN_PROC_ARGS: return (sysctl_proc_args(name, namelen, oldp, oldlenp, p)); case KERN_PROC_CWD: return (sysctl_proc_cwd(name, namelen, oldp, oldlenp, p)); case KERN_PROC_NOBROADCASTKILL: return (sysctl_proc_nobroadcastkill(name, namelen, newp, newlen, oldp, oldlenp, p)); case KERN_PROC_VMMAP: return (sysctl_proc_vmmap(name, namelen, oldp, oldlenp, p)); case KERN_FILE: return (sysctl_file(name, namelen, oldp, oldlenp, p)); #endif #if defined(GPROF) || defined(DDBPROF) case KERN_PROF: return (sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen)); #endif case KERN_MALLOCSTATS: return (sysctl_malloc(name, namelen, oldp, oldlenp, newp, newlen, p)); case KERN_TTY: return (sysctl_tty(name, namelen, oldp, oldlenp, newp, newlen)); case KERN_POOL: return (sysctl_dopool(name, namelen, oldp, oldlenp)); #if defined(SYSVMSG) || defined(SYSVSEM) || defined(SYSVSHM) case KERN_SYSVIPC_INFO: return (sysctl_sysvipc(name, namelen, oldp, oldlenp)); #endif #ifdef SYSVSEM case KERN_SEMINFO: return (sysctl_sysvsem(name, namelen, oldp, oldlenp, newp, newlen)); #endif #ifdef SYSVSHM case KERN_SHMINFO: return (sysctl_sysvshm(name, namelen, oldp, oldlenp, newp, newlen)); #endif #ifndef SMALL_KERNEL case KERN_INTRCNT: return (sysctl_intrcnt(name, namelen, oldp, oldlenp)); case 
KERN_WATCHDOG: return (sysctl_wdog(name, namelen, oldp, oldlenp, newp, newlen)); #endif #ifndef SMALL_KERNEL case KERN_EVCOUNT: return (evcount_sysctl(name, namelen, oldp, oldlenp, newp, newlen)); #endif case KERN_TIMECOUNTER: return (sysctl_tc(name, namelen, oldp, oldlenp, newp, newlen)); case KERN_CPTIME2: return (sysctl_cptime2(name, namelen, oldp, oldlenp, newp, newlen)); #ifdef WITNESS case KERN_WITNESSWATCH: return witness_sysctl_watch(oldp, oldlenp, newp, newlen); case KERN_WITNESS: return witness_sysctl(name, namelen, oldp, oldlenp, newp, newlen); #endif #if NAUDIO > 0 case KERN_AUDIO: return (sysctl_audio(name, namelen, oldp, oldlenp, newp, newlen)); #endif #if NVIDEO > 0 case KERN_VIDEO: return (sysctl_video(name, namelen, oldp, oldlenp, newp, newlen)); #endif case KERN_CPUSTATS: return (sysctl_cpustats(name, namelen, oldp, oldlenp, newp, newlen)); default: return (ENOTDIR); /* overloaded */ } } /* * kernel related system variables. */ int kern_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen, struct proc *p) { int error, level, inthostid, stackgap; dev_t dev; extern int pool_debug; /* dispatch the non-terminal nodes first */ if (namelen != 1) { return kern_sysctl_dirs(name[0], name + 1, namelen - 1, oldp, oldlenp, newp, newlen, p); } switch (name[0]) { case KERN_OSTYPE: return (sysctl_rdstring(oldp, oldlenp, newp, ostype)); case KERN_OSRELEASE: return (sysctl_rdstring(oldp, oldlenp, newp, osrelease)); case KERN_OSVERSION: return (sysctl_rdstring(oldp, oldlenp, newp, osversion)); case KERN_VERSION: return (sysctl_rdstring(oldp, oldlenp, newp, version)); case KERN_NUMVNODES: /* XXX numvnodes is a long */ return (sysctl_rdint(oldp, oldlenp, newp, numvnodes)); case KERN_SECURELVL: level = securelevel; if ((error = sysctl_int(oldp, oldlenp, newp, newlen, &level)) || newp == NULL) return (error); if ((securelevel > 0 || level < -1) && level < securelevel && p->p_p->ps_pid != 1) return (EPERM); securelevel = level; return (0); case KERN_ALLOWDT: if (securelevel > 0) return (sysctl_rdint(oldp, oldlenp, newp, allowdt)); return (sysctl_int(oldp, oldlenp, newp, newlen, &allowdt)); case KERN_ALLOWKMEM: if (securelevel > 0) return (sysctl_rdint(oldp, oldlenp, newp, allowkmem)); return (sysctl_int(oldp, oldlenp, newp, newlen, &allowkmem)); case KERN_HOSTNAME: error = sysctl_tstring(oldp, oldlenp, newp, newlen, hostname, sizeof(hostname)); if (newp && !error) hostnamelen = newlen; return (error); case KERN_DOMAINNAME: error = sysctl_tstring(oldp, oldlenp, newp, newlen, domainname, sizeof(domainname)); if (newp && !error) domainnamelen = newlen; return (error); case KERN_HOSTID: inthostid = hostid; /* XXX assumes sizeof long <= sizeof int */ error = sysctl_int(oldp, oldlenp, newp, newlen, &inthostid); hostid = inthostid; return (error); case KERN_CLOCKRATE: return (sysctl_clockrate(oldp, oldlenp, newp)); case KERN_BOOTTIME: { struct timeval bt; memset(&bt, 0, sizeof bt); microboottime(&bt); return (sysctl_rdstruct(oldp, oldlenp, newp, &bt, sizeof bt)); } case KERN_MBSTAT: { extern struct cpumem *mbstat; uint64_t counters[MBSTAT_COUNT]; struct mbstat mbs; unsigned int i; memset(&mbs, 0, sizeof(mbs)); counters_read(mbstat, counters, MBSTAT_COUNT); for (i = 0; i < MBSTAT_TYPES; i++) mbs.m_mtypes[i] = counters[i]; mbs.m_drops = counters[MBSTAT_DROPS]; mbs.m_wait = counters[MBSTAT_WAIT]; mbs.m_drain = counters[MBSTAT_DRAIN]; return (sysctl_rdstruct(oldp, oldlenp, newp, &mbs, sizeof(mbs))); } case KERN_MSGBUFSIZE: case KERN_CONSBUFSIZE: { struct msgbuf *mp; mp = 
(name[0] == KERN_MSGBUFSIZE) ? msgbufp : consbufp; /* * deal with cases where the message buffer has * become corrupted. */ if (!mp || mp->msg_magic != MSG_MAGIC) return (ENXIO); return (sysctl_rdint(oldp, oldlenp, newp, mp->msg_bufs)); } case KERN_CONSBUF: if ((error = suser(p))) return (error); /* FALLTHROUGH */ case KERN_MSGBUF: { struct msgbuf *mp; mp = (name[0] == KERN_MSGBUF) ? msgbufp : consbufp; /* see note above */ if (!mp || mp->msg_magic != MSG_MAGIC) return (ENXIO); return (sysctl_rdstruct(oldp, oldlenp, newp, mp, mp->msg_bufs + offsetof(struct msgbuf, msg_bufc))); } case KERN_CPTIME: { CPU_INFO_ITERATOR cii; struct cpu_info *ci; long cp_time[CPUSTATES]; int i, n = 0; memset(cp_time, 0, sizeof(cp_time)); CPU_INFO_FOREACH(cii, ci) { if (!cpu_is_online(ci)) continue; n++; for (i = 0; i < CPUSTATES; i++) cp_time[i] += ci->ci_schedstate.spc_cp_time[i]; } for (i = 0; i < CPUSTATES; i++) cp_time[i] /= n; return (sysctl_rdstruct(oldp, oldlenp, newp, &cp_time, sizeof(cp_time))); } case KERN_NCHSTATS: return (sysctl_rdstruct(oldp, oldlenp, newp, &nchstats, sizeof(struct nchstats))); case KERN_FORKSTAT: return (sysctl_rdstruct(oldp, oldlenp, newp, &forkstat, sizeof(struct forkstat))); case KERN_STACKGAPRANDOM: stackgap = stackgap_random; error = sysctl_int(oldp, oldlenp, newp, newlen, &stackgap); if (error) return (error); /* * Safety harness. */ if ((stackgap < ALIGNBYTES && stackgap != 0) || !powerof2(stackgap) || stackgap >= MAXSSIZ) return (EINVAL); stackgap_random = stackgap; return (0); case KERN_MAXCLUSTERS: { int val = nmbclust; error = sysctl_int(oldp, oldlenp, newp, newlen, &val); if (error == 0 && val != nmbclust) error = nmbclust_update(val); return (error); } case KERN_CACHEPCT: { u_int64_t dmapages; int opct, pgs; opct = bufcachepercent; error = sysctl_int(oldp, oldlenp, newp, newlen, &bufcachepercent); if (error) return(error); if (bufcachepercent > 90 || bufcachepercent < 5) { bufcachepercent = opct; return (EINVAL); } dmapages = uvm_pagecount(&dma_constraint); if (bufcachepercent != opct) { pgs = bufcachepercent * dmapages / 100; bufadjust(pgs); /* adjust bufpages */ bufhighpages = bufpages; /* set high water mark */ } return(0); } case KERN_CONSDEV: if (cn_tab != NULL) dev = cn_tab->cn_dev; else dev = NODEV; return sysctl_rdstruct(oldp, oldlenp, newp, &dev, sizeof(dev)); case KERN_POOL_DEBUG: { int old_pool_debug = pool_debug; error = sysctl_int(oldp, oldlenp, newp, newlen, &pool_debug); if (error == 0 && pool_debug != old_pool_debug) pool_reclaim_all(); return (error); } #if NPF > 0 case KERN_PFSTATUS: return (pf_sysctl(oldp, oldlenp, newp, newlen)); #endif case KERN_TIMEOUT_STATS: return (timeout_sysctl(oldp, oldlenp, newp, newlen)); case KERN_UTC_OFFSET: return (sysctl_utc_offset(oldp, oldlenp, newp, newlen)); default: return (sysctl_bounded_arr(kern_vars, nitems(kern_vars), name, namelen, oldp, oldlenp, newp, newlen)); } /* NOTREACHED */ } /* * hardware related system variables. 
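 *
 * Exposed to userland as the CTL_HW subtree, e.g. "sysctl hw.ncpu" or
 * the { CTL_HW, HW_NCPU } name pair; most nodes here are read-only.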
*/ char *hw_vendor, *hw_prod, *hw_uuid, *hw_serial, *hw_ver; int allowpowerdown = 1; /* morally const values reported by sysctl_bounded_arr */ static int byte_order = BYTE_ORDER; static int page_size = PAGE_SIZE; const struct sysctl_bounded_args hw_vars[] = { {HW_NCPU, &ncpus, 1, 0}, {HW_NCPUFOUND, &ncpusfound, 1, 0}, {HW_BYTEORDER, &byte_order, 1, 0}, {HW_PAGESIZE, &page_size, 1, 0}, {HW_DISKCOUNT, &disk_count, 1, 0}, }; int hw_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen, struct proc *p) { extern char machine[], cpu_model[]; int err, cpuspeed; /* all sysctl names at this level except sensors are terminal */ if (name[0] != HW_SENSORS && namelen != 1) return (ENOTDIR); /* overloaded */ switch (name[0]) { case HW_MACHINE: return (sysctl_rdstring(oldp, oldlenp, newp, machine)); case HW_MODEL: return (sysctl_rdstring(oldp, oldlenp, newp, cpu_model)); case HW_NCPUONLINE: return (sysctl_rdint(oldp, oldlenp, newp, sysctl_hwncpuonline())); case HW_PHYSMEM: return (sysctl_rdint(oldp, oldlenp, newp, ptoa(physmem))); case HW_USERMEM: return (sysctl_rdint(oldp, oldlenp, newp, ptoa(physmem - uvmexp.wired))); case HW_DISKNAMES: err = sysctl_diskinit(0, p); if (err) return err; if (disknames) return (sysctl_rdstring(oldp, oldlenp, newp, disknames)); else return (sysctl_rdstring(oldp, oldlenp, newp, "")); case HW_DISKSTATS: err = sysctl_diskinit(1, p); if (err) return err; return (sysctl_rdstruct(oldp, oldlenp, newp, diskstats, disk_count * sizeof(struct diskstats))); case HW_CPUSPEED: if (!cpu_cpuspeed) return (EOPNOTSUPP); err = cpu_cpuspeed(&cpuspeed); if (err) return err; return (sysctl_rdint(oldp, oldlenp, newp, cpuspeed)); #ifndef SMALL_KERNEL case HW_SENSORS: return (sysctl_sensors(name + 1, namelen - 1, oldp, oldlenp, newp, newlen)); case HW_SETPERF: return (sysctl_hwsetperf(oldp, oldlenp, newp, newlen)); case HW_PERFPOLICY: return (sysctl_hwperfpolicy(oldp, oldlenp, newp, newlen)); #endif /* !SMALL_KERNEL */ case HW_VENDOR: if (hw_vendor) return (sysctl_rdstring(oldp, oldlenp, newp, hw_vendor)); else return (EOPNOTSUPP); case HW_PRODUCT: if (hw_prod) return (sysctl_rdstring(oldp, oldlenp, newp, hw_prod)); else return (EOPNOTSUPP); case HW_VERSION: if (hw_ver) return (sysctl_rdstring(oldp, oldlenp, newp, hw_ver)); else return (EOPNOTSUPP); case HW_SERIALNO: if (hw_serial) return (sysctl_rdstring(oldp, oldlenp, newp, hw_serial)); else return (EOPNOTSUPP); case HW_UUID: if (hw_uuid) return (sysctl_rdstring(oldp, oldlenp, newp, hw_uuid)); else return (EOPNOTSUPP); case HW_PHYSMEM64: return (sysctl_rdquad(oldp, oldlenp, newp, ptoa((psize_t)physmem))); case HW_USERMEM64: return (sysctl_rdquad(oldp, oldlenp, newp, ptoa((psize_t)physmem - uvmexp.wired))); case HW_ALLOWPOWERDOWN: if (securelevel > 0) return (sysctl_rdint(oldp, oldlenp, newp, allowpowerdown)); return (sysctl_int(oldp, oldlenp, newp, newlen, &allowpowerdown)); #ifdef __HAVE_CPU_TOPOLOGY case HW_SMT: return (sysctl_hwsmt(oldp, oldlenp, newp, newlen)); #endif default: return sysctl_bounded_arr(hw_vars, nitems(hw_vars), name, namelen, oldp, oldlenp, newp, newlen); } /* NOTREACHED */ } #ifdef DEBUG_SYSCTL /* * Debugging related system variables. 
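 *
 * Each CTL_DEBUG slot pairs a string name (CTL_DEBUG_NAME) with a
 * pointer to an int (CTL_DEBUG_VALUE); see debug_sysctl() below.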
*/ extern struct ctldebug debug_vfs_busyprt; struct ctldebug debug1, debug2, debug3, debug4; struct ctldebug debug5, debug6, debug7, debug8, debug9; struct ctldebug debug10, debug11, debug12, debug13, debug14; struct ctldebug debug15, debug16, debug17, debug18, debug19; static struct ctldebug *debugvars[CTL_DEBUG_MAXID] = { &debug_vfs_busyprt, &debug1, &debug2, &debug3, &debug4, &debug5, &debug6, &debug7, &debug8, &debug9, &debug10, &debug11, &debug12, &debug13, &debug14, &debug15, &debug16, &debug17, &debug18, &debug19, }; int debug_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen, struct proc *p) { struct ctldebug *cdp; /* all sysctl names at this level are name and field */ if (namelen != 2) return (ENOTDIR); /* overloaded */ if (name[0] < 0 || name[0] >= nitems(debugvars)) return (EOPNOTSUPP); cdp = debugvars[name[0]]; if (cdp->debugname == 0) return (EOPNOTSUPP); switch (name[1]) { case CTL_DEBUG_NAME: return (sysctl_rdstring(oldp, oldlenp, newp, cdp->debugname)); case CTL_DEBUG_VALUE: return (sysctl_int(oldp, oldlenp, newp, newlen, cdp->debugvar)); default: return (EOPNOTSUPP); } /* NOTREACHED */ } #endif /* DEBUG_SYSCTL */ /* * Reads, or writes that lower the value */ int sysctl_int_lower(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int *valp) { unsigned int oval = *valp, val = *valp; int error; if (newp == NULL) return (sysctl_rdint(oldp, oldlenp, newp, *valp)); if ((error = sysctl_int(oldp, oldlenp, newp, newlen, &val))) return (error); if (val > oval) return (EPERM); /* do not allow raising */ *(unsigned int *)valp = val; return (0); } /* * Validate parameters and get old / set new parameters * for an integer-valued sysctl function. */ int sysctl_int(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int *valp) { return (sysctl_int_bounded(oldp, oldlenp, newp, newlen, valp, 0, 0)); } int sysctl_int_bounded(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int *valp, int minimum, int maximum) { int error = 0; int val; if (oldp && *oldlenp < sizeof(int)) return (ENOMEM); if (newp && newlen != sizeof(int)) return (EINVAL); *oldlenp = sizeof(int); val = *valp; if (oldp) error = copyout(&val, oldp, sizeof(int)); if (error == 0 && newp) error = copyin(newp, &val, sizeof(int)); if (error) return (error); if (minimum == maximum || (minimum <= val && val <= maximum)) *valp = val; else error = EINVAL; return (error); } /* * As above, but read-only. */ int sysctl_rdint(void *oldp, size_t *oldlenp, void *newp, int val) { int error = 0; if (oldp && *oldlenp < sizeof(int)) return (ENOMEM); if (newp) return (EPERM); *oldlenp = sizeof(int); if (oldp) error = copyout((caddr_t)&val, oldp, sizeof(int)); return (error); } /* * Array of bounded integer values. */ int sysctl_bounded_arr(const struct sysctl_bounded_args *valpp, u_int valplen, int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { u_int i; if (namelen != 1) return (ENOTDIR); for (i = 0; i < valplen; ++i) { if (valpp[i].mib == name[0]) { if (valpp[i].minimum <= valpp[i].maximum) { return (sysctl_int_bounded(oldp, oldlenp, newp, newlen, valpp[i].var, valpp[i].minimum, valpp[i].maximum)); } else { return (sysctl_rdint(oldp, oldlenp, newp, *valpp[i].var)); } } } return (EOPNOTSUPP); } /* * Validate parameters and get old / set new parameters * for an integer-valued sysctl function. 
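 * (This is the 64-bit flavour: the same copyout/copyin pattern as
 *  sysctl_int() above, but operating on an int64_t.)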
*/ int sysctl_quad(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int64_t *valp) { int error = 0; if (oldp && *oldlenp < sizeof(int64_t)) return (ENOMEM); if (newp && newlen != sizeof(int64_t)) return (EINVAL); *oldlenp = sizeof(int64_t); if (oldp) error = copyout(valp, oldp, sizeof(int64_t)); if (error == 0 && newp) error = copyin(newp, valp, sizeof(int64_t)); return (error); } /* * As above, but read-only. */ int sysctl_rdquad(void *oldp, size_t *oldlenp, void *newp, int64_t val) { int error = 0; if (oldp && *oldlenp < sizeof(int64_t)) return (ENOMEM); if (newp) return (EPERM); *oldlenp = sizeof(int64_t); if (oldp) error = copyout((caddr_t)&val, oldp, sizeof(int64_t)); return (error); } /* * Validate parameters and get old / set new parameters * for a string-valued sysctl function. */ int sysctl_string(void *oldp, size_t *oldlenp, void *newp, size_t newlen, char *str, size_t maxlen) { return sysctl__string(oldp, oldlenp, newp, newlen, str, maxlen, 0); } int sysctl_tstring(void *oldp, size_t *oldlenp, void *newp, size_t newlen, char *str, size_t maxlen) { return sysctl__string(oldp, oldlenp, newp, newlen, str, maxlen, 1); } int sysctl__string(void *oldp, size_t *oldlenp, void *newp, size_t newlen, char *str, size_t maxlen, int trunc) { size_t len; int error = 0; len = strlen(str) + 1; if (oldp && *oldlenp < len) { if (trunc == 0 || *oldlenp == 0) return (ENOMEM); } if (newp && newlen >= maxlen) return (EINVAL); if (oldp) { if (trunc && *oldlenp < len) { len = *oldlenp; error = copyout(str, oldp, len - 1); if (error == 0) error = copyout("", (char *)oldp + len - 1, 1); } else { error = copyout(str, oldp, len); } } *oldlenp = len; if (error == 0 && newp) { error = copyin(newp, str, newlen); str[newlen] = 0; } return (error); } /* * As above, but read-only. */ int sysctl_rdstring(void *oldp, size_t *oldlenp, void *newp, const char *str) { size_t len; int error = 0; len = strlen(str) + 1; if (oldp && *oldlenp < len) return (ENOMEM); if (newp) return (EPERM); *oldlenp = len; if (oldp) error = copyout(str, oldp, len); return (error); } /* * Validate parameters and get old / set new parameters * for a structure oriented sysctl function. */ int sysctl_struct(void *oldp, size_t *oldlenp, void *newp, size_t newlen, void *sp, size_t len) { int error = 0; if (oldp && *oldlenp < len) return (ENOMEM); if (newp && newlen > len) return (EINVAL); if (oldp) { *oldlenp = len; error = copyout(sp, oldp, len); } if (error == 0 && newp) error = copyin(newp, sp, len); return (error); } /* * Validate parameters and get old parameters * for a structure oriented sysctl function. 
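 * (Read-only variant: a non-NULL newp is refused with EPERM.)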
*/ int sysctl_rdstruct(void *oldp, size_t *oldlenp, void *newp, const void *sp, size_t len) { int error = 0; if (oldp && *oldlenp < len) return (ENOMEM); if (newp) return (EPERM); *oldlenp = len; if (oldp) error = copyout(sp, oldp, len); return (error); } #ifndef SMALL_KERNEL void fill_file(struct kinfo_file *kf, struct file *fp, struct filedesc *fdp, int fd, struct vnode *vp, struct process *pr, struct proc *p, struct socket *so, int show_pointers) { struct vattr va; memset(kf, 0, sizeof(*kf)); kf->fd_fd = fd; /* might not really be an fd */ if (fp != NULL) { if (show_pointers) kf->f_fileaddr = PTRTOINT64(fp); kf->f_flag = fp->f_flag; kf->f_iflags = fp->f_iflags; kf->f_type = fp->f_type; kf->f_count = fp->f_count; if (show_pointers) kf->f_ucred = PTRTOINT64(fp->f_cred); kf->f_uid = fp->f_cred->cr_uid; kf->f_gid = fp->f_cred->cr_gid; if (show_pointers) kf->f_ops = PTRTOINT64(fp->f_ops); if (show_pointers) kf->f_data = PTRTOINT64(fp->f_data); kf->f_usecount = 0; if (suser(p) == 0 || p->p_ucred->cr_uid == fp->f_cred->cr_uid) { mtx_enter(&fp->f_mtx); kf->f_offset = fp->f_offset; kf->f_rxfer = fp->f_rxfer; kf->f_rwfer = fp->f_wxfer; kf->f_seek = fp->f_seek; kf->f_rbytes = fp->f_rbytes; kf->f_wbytes = fp->f_wbytes; mtx_leave(&fp->f_mtx); } else kf->f_offset = -1; } else if (vp != NULL) { /* fake it */ kf->f_type = DTYPE_VNODE; kf->f_flag = FREAD; if (fd == KERN_FILE_TRACE) kf->f_flag |= FWRITE; } else if (so != NULL) { /* fake it */ kf->f_type = DTYPE_SOCKET; } /* information about the object associated with this file */ switch (kf->f_type) { case DTYPE_VNODE: if (fp != NULL) vp = (struct vnode *)fp->f_data; if (show_pointers) kf->v_un = PTRTOINT64(vp->v_un.vu_socket); kf->v_type = vp->v_type; kf->v_tag = vp->v_tag; kf->v_flag = vp->v_flag; if (show_pointers) kf->v_data = PTRTOINT64(vp->v_data); if (show_pointers) kf->v_mount = PTRTOINT64(vp->v_mount); if (vp->v_mount) strlcpy(kf->f_mntonname, vp->v_mount->mnt_stat.f_mntonname, sizeof(kf->f_mntonname)); if (VOP_GETATTR(vp, &va, p->p_ucred, p) == 0) { kf->va_fileid = va.va_fileid; kf->va_mode = MAKEIMODE(va.va_type, va.va_mode); kf->va_size = va.va_size; kf->va_rdev = va.va_rdev; kf->va_fsid = va.va_fsid & 0xffffffff; kf->va_nlink = va.va_nlink; } break; case DTYPE_SOCKET: { int locked = 0; if (so == NULL) { so = (struct socket *)fp->f_data; /* if so is passed as parameter it is already locked */ switch (so->so_proto->pr_domain->dom_family) { case AF_INET: case AF_INET6: NET_LOCK(); locked = 1; break; } } kf->so_type = so->so_type; kf->so_state = so->so_state; if (show_pointers) kf->so_pcb = PTRTOINT64(so->so_pcb); else kf->so_pcb = -1; kf->so_protocol = so->so_proto->pr_protocol; kf->so_family = so->so_proto->pr_domain->dom_family; kf->so_rcv_cc = so->so_rcv.sb_cc; kf->so_snd_cc = so->so_snd.sb_cc; if (isspliced(so)) { if (show_pointers) kf->so_splice = PTRTOINT64(so->so_sp->ssp_socket); kf->so_splicelen = so->so_sp->ssp_len; } else if (issplicedback(so)) kf->so_splicelen = -1; if (so->so_pcb == NULL) { if (locked) NET_UNLOCK(); break; } switch (kf->so_family) { case AF_INET: { struct inpcb *inpcb = so->so_pcb; NET_ASSERT_LOCKED(); if (show_pointers) kf->inp_ppcb = PTRTOINT64(inpcb->inp_ppcb); kf->inp_lport = inpcb->inp_lport; kf->inp_laddru[0] = inpcb->inp_laddr.s_addr; kf->inp_fport = inpcb->inp_fport; kf->inp_faddru[0] = inpcb->inp_faddr.s_addr; kf->inp_rtableid = inpcb->inp_rtableid; if (so->so_type == SOCK_RAW) kf->inp_proto = inpcb->inp_ip.ip_p; if (so->so_proto->pr_protocol == IPPROTO_TCP) { struct tcpcb *tcpcb = (void *)inpcb->inp_ppcb; 
kf->t_rcv_wnd = tcpcb->rcv_wnd; kf->t_snd_wnd = tcpcb->snd_wnd; kf->t_snd_cwnd = tcpcb->snd_cwnd; kf->t_state = tcpcb->t_state; } break; } case AF_INET6: { struct inpcb *inpcb = so->so_pcb; NET_ASSERT_LOCKED(); if (show_pointers) kf->inp_ppcb = PTRTOINT64(inpcb->inp_ppcb); kf->inp_lport = inpcb->inp_lport; kf->inp_laddru[0] = inpcb->inp_laddr6.s6_addr32[0]; kf->inp_laddru[1] = inpcb->inp_laddr6.s6_addr32[1]; kf->inp_laddru[2] = inpcb->inp_laddr6.s6_addr32[2]; kf->inp_laddru[3] = inpcb->inp_laddr6.s6_addr32[3]; kf->inp_fport = inpcb->inp_fport; kf->inp_faddru[0] = inpcb->inp_faddr6.s6_addr32[0]; kf->inp_faddru[1] = inpcb->inp_faddr6.s6_addr32[1]; kf->inp_faddru[2] = inpcb->inp_faddr6.s6_addr32[2]; kf->inp_faddru[3] = inpcb->inp_faddr6.s6_addr32[3]; kf->inp_rtableid = inpcb->inp_rtableid; if (so->so_type == SOCK_RAW) kf->inp_proto = inpcb->inp_ipv6.ip6_nxt; if (so->so_proto->pr_protocol == IPPROTO_TCP) { struct tcpcb *tcpcb = (void *)inpcb->inp_ppcb; kf->t_rcv_wnd = tcpcb->rcv_wnd; kf->t_snd_wnd = tcpcb->snd_wnd; kf->t_state = tcpcb->t_state; } break; } case AF_UNIX: { struct unpcb *unpcb = so->so_pcb; kf->f_msgcount = unpcb->unp_msgcount; if (show_pointers) { kf->unp_conn = PTRTOINT64(unpcb->unp_conn); kf->unp_refs = PTRTOINT64( SLIST_FIRST(&unpcb->unp_refs)); kf->unp_nextref = PTRTOINT64( SLIST_NEXT(unpcb, unp_nextref)); kf->v_un = PTRTOINT64(unpcb->unp_vnode); kf->unp_addr = PTRTOINT64(unpcb->unp_addr); } if (unpcb->unp_addr != NULL) { struct sockaddr_un *un = mtod(unpcb->unp_addr, struct sockaddr_un *); memcpy(kf->unp_path, un->sun_path, un->sun_len - offsetof(struct sockaddr_un,sun_path)); } break; } } if (locked) NET_UNLOCK(); break; } case DTYPE_PIPE: { struct pipe *pipe = (struct pipe *)fp->f_data; if (show_pointers) kf->pipe_peer = PTRTOINT64(pipe->pipe_peer); kf->pipe_state = pipe->pipe_state; break; } case DTYPE_KQUEUE: { struct kqueue *kqi = (struct kqueue *)fp->f_data; kf->kq_count = kqi->kq_count; kf->kq_state = kqi->kq_state; break; } } /* per-process information for KERN_FILE_BY[PU]ID */ if (pr != NULL) { kf->p_pid = pr->ps_pid; kf->p_uid = pr->ps_ucred->cr_uid; kf->p_gid = pr->ps_ucred->cr_gid; kf->p_tid = -1; strlcpy(kf->p_comm, pr->ps_comm, sizeof(kf->p_comm)); } if (fdp != NULL) { fdplock(fdp); kf->fd_ofileflags = fdp->fd_ofileflags[fd]; fdpunlock(fdp); } } /* * Get file structures. */ int sysctl_file(int *name, u_int namelen, char *where, size_t *sizep, struct proc *p) { struct kinfo_file *kf; struct filedesc *fdp; struct file *fp; struct process *pr; size_t buflen, elem_size, elem_count, outsize; char *dp = where; int arg, i, error = 0, needed = 0, matched; u_int op; int show_pointers; if (namelen > 4) return (ENOTDIR); if (namelen < 4 || name[2] > sizeof(*kf)) return (EINVAL); buflen = where != NULL ? 
*sizep : 0; op = name[0]; arg = name[1]; elem_size = name[2]; elem_count = name[3]; outsize = MIN(sizeof(*kf), elem_size); if (elem_size < 1) return (EINVAL); show_pointers = suser(curproc) == 0; kf = malloc(sizeof(*kf), M_TEMP, M_WAITOK); #define FILLIT2(fp, fdp, i, vp, pr, so) do { \ if (buflen >= elem_size && elem_count > 0) { \ fill_file(kf, fp, fdp, i, vp, pr, p, so, show_pointers);\ error = copyout(kf, dp, outsize); \ if (error) \ break; \ dp += elem_size; \ buflen -= elem_size; \ elem_count--; \ } \ needed += elem_size; \ } while (0) #define FILLIT(fp, fdp, i, vp, pr) \ FILLIT2(fp, fdp, i, vp, pr, NULL) #define FILLSO(so) \ FILLIT2(NULL, NULL, 0, NULL, NULL, so) switch (op) { case KERN_FILE_BYFILE: /* use the inp-tables to pick up closed connections, too */ if (arg == DTYPE_SOCKET) { struct inpcb *inp; NET_LOCK(); TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue) FILLSO(inp->inp_socket); TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) FILLSO(inp->inp_socket); TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) FILLSO(inp->inp_socket); #ifdef INET6 TAILQ_FOREACH(inp, &rawin6pcbtable.inpt_queue, inp_queue) FILLSO(inp->inp_socket); #endif NET_UNLOCK(); } fp = NULL; while ((fp = fd_iterfile(fp, p)) != NULL) { if ((arg == 0 || fp->f_type == arg)) { int af, skip = 0; if (arg == DTYPE_SOCKET && fp->f_type == arg) { af = ((struct socket *)fp->f_data)-> so_proto->pr_domain->dom_family; if (af == AF_INET || af == AF_INET6) skip = 1; } if (!skip) FILLIT(fp, NULL, 0, NULL, NULL); } } break; case KERN_FILE_BYPID: /* A arg of -1 indicates all processes */ if (arg < -1) { error = EINVAL; break; } matched = 0; LIST_FOREACH(pr, &allprocess, ps_list) { /* * skip system, exiting, embryonic and undead * processes */ if (pr->ps_flags & (PS_SYSTEM | PS_EMBRYO | PS_EXITING)) continue; if (arg > 0 && pr->ps_pid != (pid_t)arg) { /* not the pid we are looking for */ continue; } matched = 1; fdp = pr->ps_fd; if (pr->ps_textvp) FILLIT(NULL, NULL, KERN_FILE_TEXT, pr->ps_textvp, pr); if (fdp->fd_cdir) FILLIT(NULL, NULL, KERN_FILE_CDIR, fdp->fd_cdir, pr); if (fdp->fd_rdir) FILLIT(NULL, NULL, KERN_FILE_RDIR, fdp->fd_rdir, pr); if (pr->ps_tracevp) FILLIT(NULL, NULL, KERN_FILE_TRACE, pr->ps_tracevp, pr); for (i = 0; i < fdp->fd_nfiles; i++) { if ((fp = fd_getfile(fdp, i)) == NULL) continue; FILLIT(fp, fdp, i, NULL, pr); FRELE(fp, p); } } if (!matched) error = ESRCH; break; case KERN_FILE_BYUID: LIST_FOREACH(pr, &allprocess, ps_list) { /* * skip system, exiting, embryonic and undead * processes */ if (pr->ps_flags & (PS_SYSTEM | PS_EMBRYO | PS_EXITING)) continue; if (arg >= 0 && pr->ps_ucred->cr_uid != (uid_t)arg) { /* not the uid we are looking for */ continue; } fdp = pr->ps_fd; if (fdp->fd_cdir) FILLIT(NULL, NULL, KERN_FILE_CDIR, fdp->fd_cdir, pr); if (fdp->fd_rdir) FILLIT(NULL, NULL, KERN_FILE_RDIR, fdp->fd_rdir, pr); if (pr->ps_tracevp) FILLIT(NULL, NULL, KERN_FILE_TRACE, pr->ps_tracevp, pr); for (i = 0; i < fdp->fd_nfiles; i++) { if ((fp = fd_getfile(fdp, i)) == NULL) continue; FILLIT(fp, fdp, i, NULL, pr); FRELE(fp, p); } } break; default: error = EINVAL; break; } free(kf, M_TEMP, sizeof(*kf)); if (!error) { if (where == NULL) needed += KERN_FILESLOP * elem_size; else if (*sizep < needed) error = ENOMEM; *sizep = needed; } return (error); } /* * try over estimating by 5 procs */ #define KERN_PROCSLOP 5 int sysctl_doproc(int *name, u_int namelen, char *where, size_t *sizep) { struct kinfo_proc *kproc = NULL; struct proc *p; struct process *pr; char *dp; int arg, buflen, doingzomb, elem_size, elem_count; 
int error, needed, op; int dothreads = 0; int show_pointers; dp = where; buflen = where != NULL ? *sizep : 0; needed = error = 0; if (namelen != 4 || name[2] <= 0 || name[3] < 0 || name[2] > sizeof(*kproc)) return (EINVAL); op = name[0]; arg = name[1]; elem_size = name[2]; elem_count = name[3]; dothreads = op & KERN_PROC_SHOW_THREADS; op &= ~KERN_PROC_SHOW_THREADS; show_pointers = suser(curproc) == 0; if (where != NULL) kproc = malloc(sizeof(*kproc), M_TEMP, M_WAITOK); pr = LIST_FIRST(&allprocess); doingzomb = 0; again: for (; pr != NULL; pr = LIST_NEXT(pr, ps_list)) { /* XXX skip processes in the middle of being zapped */ if (pr->ps_pgrp == NULL) continue; /* * Skip embryonic processes. */ if (pr->ps_flags & PS_EMBRYO) continue; /* * TODO - make more efficient (see notes below). */ switch (op) { case KERN_PROC_PID: /* could do this with just a lookup */ if (pr->ps_pid != (pid_t)arg) continue; break; case KERN_PROC_PGRP: /* could do this by traversing pgrp */ if (pr->ps_pgrp->pg_id != (pid_t)arg) continue; break; case KERN_PROC_SESSION: if (pr->ps_session->s_leader == NULL || pr->ps_session->s_leader->ps_pid != (pid_t)arg) continue; break; case KERN_PROC_TTY: if ((pr->ps_flags & PS_CONTROLT) == 0 || pr->ps_session->s_ttyp == NULL || pr->ps_session->s_ttyp->t_dev != (dev_t)arg) continue; break; case KERN_PROC_UID: if (pr->ps_ucred->cr_uid != (uid_t)arg) continue; break; case KERN_PROC_RUID: if (pr->ps_ucred->cr_ruid != (uid_t)arg) continue; break; case KERN_PROC_ALL: if (pr->ps_flags & PS_SYSTEM) continue; break; case KERN_PROC_KTHREAD: /* no filtering */ break; default: error = EINVAL; goto err; } if (buflen >= elem_size && elem_count > 0) { fill_kproc(pr, kproc, NULL, show_pointers); error = copyout(kproc, dp, elem_size); if (error) goto err; dp += elem_size; buflen -= elem_size; elem_count--; } needed += elem_size; /* Skip per-thread entries if not required by op */ if (!dothreads) continue; TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) { if (buflen >= elem_size && elem_count > 0) { fill_kproc(pr, kproc, p, show_pointers); error = copyout(kproc, dp, elem_size); if (error) goto err; dp += elem_size; buflen -= elem_size; elem_count--; } needed += elem_size; } } if (doingzomb == 0) { pr = LIST_FIRST(&zombprocess); doingzomb++; goto again; } if (where != NULL) { *sizep = dp - where; if (needed > *sizep) { error = ENOMEM; goto err; } } else { needed += KERN_PROCSLOP * elem_size; *sizep = needed; } err: if (kproc) free(kproc, M_TEMP, sizeof(*kproc)); return (error); } /* * Fill in a kproc structure for the specified process. */ void fill_kproc(struct process *pr, struct kinfo_proc *ki, struct proc *p, int show_pointers) { struct session *s = pr->ps_session; struct tty *tp; struct vmspace *vm = pr->ps_vmspace; struct timespec booted, st, ut, utc; int isthread; isthread = p != NULL; if (!isthread) p = pr->ps_mainproc; /* XXX */ FILL_KPROC(ki, strlcpy, p, pr, pr->ps_ucred, pr->ps_pgrp, p, pr, s, vm, pr->ps_limit, pr->ps_sigacts, isthread, show_pointers); /* stuff that's too painful to generalize into the macros */ if (pr->ps_pptr) ki->p_ppid = pr->ps_ppid; if (s->s_leader) ki->p_sid = s->s_leader->ps_pid; if ((pr->ps_flags & PS_CONTROLT) && (tp = s->s_ttyp)) { ki->p_tdev = tp->t_dev; ki->p_tpgid = tp->t_pgrp ? 
tp->t_pgrp->pg_id : -1; if (show_pointers) ki->p_tsess = PTRTOINT64(tp->t_session); } else { ki->p_tdev = NODEV; ki->p_tpgid = -1; } /* fixups that can only be done in the kernel */ if ((pr->ps_flags & PS_ZOMBIE) == 0) { if ((pr->ps_flags & PS_EMBRYO) == 0 && vm != NULL) ki->p_vm_rssize = vm_resident_count(vm); calctsru(isthread ? &p->p_tu : &pr->ps_tu, &ut, &st, NULL); ki->p_uutime_sec = ut.tv_sec; ki->p_uutime_usec = ut.tv_nsec/1000; ki->p_ustime_sec = st.tv_sec; ki->p_ustime_usec = st.tv_nsec/1000; /* Convert starting uptime to a starting UTC time. */ nanoboottime(&booted); timespecadd(&booted, &pr->ps_start, &utc); ki->p_ustart_sec = utc.tv_sec; ki->p_ustart_usec = utc.tv_nsec / 1000; #ifdef MULTIPROCESSOR if (p->p_cpu != NULL) ki->p_cpuid = CPU_INFO_UNIT(p->p_cpu); #endif } /* get %cpu and schedule state: just one thread or sum of all? */ if (isthread) { ki->p_pctcpu = p->p_pctcpu; ki->p_stat = p->p_stat; } else { ki->p_pctcpu = 0; ki->p_stat = (pr->ps_flags & PS_ZOMBIE) ? SDEAD : SIDL; TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) { ki->p_pctcpu += p->p_pctcpu; /* find best state: ONPROC > RUN > STOP > SLEEP > .. */ if (p->p_stat == SONPROC || ki->p_stat == SONPROC) ki->p_stat = SONPROC; else if (p->p_stat == SRUN || ki->p_stat == SRUN) ki->p_stat = SRUN; else if (p->p_stat == SSTOP || ki->p_stat == SSTOP) ki->p_stat = SSTOP; else if (p->p_stat == SSLEEP) ki->p_stat = SSLEEP; } } } int sysctl_proc_args(int *name, u_int namelen, void *oldp, size_t *oldlenp, struct proc *cp) { struct process *vpr; pid_t pid; struct ps_strings pss; struct iovec iov; struct uio uio; int error, cnt, op; size_t limit; char **rargv, **vargv; /* reader vs. victim */ char *rarg, *varg, *buf; struct vmspace *vm; vaddr_t ps_strings; if (namelen > 2) return (ENOTDIR); if (namelen < 2) return (EINVAL); pid = name[0]; op = name[1]; switch (op) { case KERN_PROC_ARGV: case KERN_PROC_NARGV: case KERN_PROC_ENV: case KERN_PROC_NENV: break; default: return (EOPNOTSUPP); } if ((vpr = prfind(pid)) == NULL) return (ESRCH); if (oldp == NULL) { if (op == KERN_PROC_NARGV || op == KERN_PROC_NENV) *oldlenp = sizeof(int); else *oldlenp = ARG_MAX; /* XXX XXX XXX */ return (0); } /* Either system process or exiting/zombie */ if (vpr->ps_flags & (PS_SYSTEM | PS_EXITING)) return (EINVAL); /* Execing - danger. */ if ((vpr->ps_flags & PS_INEXEC)) return (EBUSY); /* Only owner or root can get env */ if ((op == KERN_PROC_NENV || op == KERN_PROC_ENV) && (vpr->ps_ucred->cr_uid != cp->p_ucred->cr_uid && (error = suser(cp)) != 0)) return (error); ps_strings = vpr->ps_strings; vm = vpr->ps_vmspace; uvmspace_addref(vm); vpr = NULL; buf = malloc(PAGE_SIZE, M_TEMP, M_WAITOK); iov.iov_base = &pss; iov.iov_len = sizeof(pss); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = (off_t)ps_strings; uio.uio_resid = sizeof(pss); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_procp = cp; if ((error = uvm_io(&vm->vm_map, &uio, 0)) != 0) goto out; if (op == KERN_PROC_NARGV) { error = sysctl_rdint(oldp, oldlenp, NULL, pss.ps_nargvstr); goto out; } if (op == KERN_PROC_NENV) { error = sysctl_rdint(oldp, oldlenp, NULL, pss.ps_nenvstr); goto out; } if (op == KERN_PROC_ARGV) { cnt = pss.ps_nargvstr; vargv = pss.ps_argvstr; } else { cnt = pss.ps_nenvstr; vargv = pss.ps_envstr; } /* -1 to have space for a terminating NUL */ limit = *oldlenp - 1; *oldlenp = 0; rargv = oldp; /* * *oldlenp - number of bytes copied out into readers buffer. * limit - maximal number of bytes allowed into readers buffer. 
* rarg - pointer into readers buffer where next arg will be stored. * rargv - pointer into readers buffer where the next rarg pointer * will be stored. * vargv - pointer into victim address space where the next argument * will be read. */ /* space for cnt pointers and a NULL */ rarg = (char *)(rargv + cnt + 1); *oldlenp += (cnt + 1) * sizeof(char **); while (cnt > 0 && *oldlenp < limit) { size_t len, vstrlen; /* Write to readers argv */ if ((error = copyout(&rarg, rargv, sizeof(rarg))) != 0) goto out; /* read the victim argv */ iov.iov_base = &varg; iov.iov_len = sizeof(varg); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = (off_t)(vaddr_t)vargv; uio.uio_resid = sizeof(varg); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_procp = cp; if ((error = uvm_io(&vm->vm_map, &uio, 0)) != 0) goto out; if (varg == NULL) break; /* * read the victim arg. We must jump through hoops to avoid * crossing a page boundary too much and returning an error. */ more: len = PAGE_SIZE - (((vaddr_t)varg) & PAGE_MASK); /* leave space for the terminating NUL */ iov.iov_base = buf; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = (off_t)(vaddr_t)varg; uio.uio_resid = len; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_procp = cp; if ((error = uvm_io(&vm->vm_map, &uio, 0)) != 0) goto out; for (vstrlen = 0; vstrlen < len; vstrlen++) { if (buf[vstrlen] == '\0') break; } /* Don't overflow readers buffer. */ if (*oldlenp + vstrlen + 1 >= limit) { error = ENOMEM; goto out; } if ((error = copyout(buf, rarg, vstrlen)) != 0) goto out; *oldlenp += vstrlen; rarg += vstrlen; /* The string didn't end in this page? */ if (vstrlen == len) { varg += vstrlen; goto more; } /* End of string. Terminate it with a NUL */ buf[0] = '\0'; if ((error = copyout(buf, rarg, 1)) != 0) goto out; *oldlenp += 1; rarg += 1; vargv++; rargv++; cnt--; } if (*oldlenp >= limit) { error = ENOMEM; goto out; } /* Write the terminating null */ rarg = NULL; error = copyout(&rarg, rargv, sizeof(rarg)); out: uvmspace_free(vm); free(buf, M_TEMP, PAGE_SIZE); return (error); } int sysctl_proc_cwd(int *name, u_int namelen, void *oldp, size_t *oldlenp, struct proc *cp) { struct process *findpr; struct vnode *vp; pid_t pid; int error; size_t lenused, len; char *path, *bp, *bend; if (namelen > 1) return (ENOTDIR); if (namelen < 1) return (EINVAL); pid = name[0]; if ((findpr = prfind(pid)) == NULL) return (ESRCH); if (oldp == NULL) { *oldlenp = MAXPATHLEN * 4; return (0); } /* Either system process or exiting/zombie */ if (findpr->ps_flags & (PS_SYSTEM | PS_EXITING)) return (EINVAL); /* Only owner or root can get cwd */ if (findpr->ps_ucred->cr_uid != cp->p_ucred->cr_uid && (error = suser(cp)) != 0) return (error); len = *oldlenp; if (len > MAXPATHLEN * 4) len = MAXPATHLEN * 4; else if (len < 2) return (ERANGE); *oldlenp = 0; /* snag a reference to the vnode before we can sleep */ vp = findpr->ps_fd->fd_cdir; vref(vp); path = malloc(len, M_TEMP, M_WAITOK); bp = &path[len]; bend = bp; *(--bp) = '\0'; /* Same as sys__getcwd */ error = vfs_getcwd_common(vp, NULL, &bp, path, len / 2, GETCWD_CHECK_ACCESS, cp); if (error == 0) { *oldlenp = lenused = bend - bp; error = copyout(bp, oldp, lenused); } vrele(vp); free(path, M_TEMP, len); return (error); } int sysctl_proc_nobroadcastkill(int *name, u_int namelen, void *newp, size_t newlen, void *oldp, size_t *oldlenp, struct proc *cp) { struct process *findpr; pid_t pid; int error, flag; if (namelen > 1) return (ENOTDIR); if (namelen < 1) return (EINVAL); pid = 
name[0]; if ((findpr = prfind(pid)) == NULL) return (ESRCH); /* Either system process or exiting/zombie */ if (findpr->ps_flags & (PS_SYSTEM | PS_EXITING)) return (EINVAL); /* Only root can change PS_NOBROADCASTKILL */ if (newp != 0 && (error = suser(cp)) != 0) return (error); /* get the PS_NOBROADCASTKILL flag */ flag = findpr->ps_flags & PS_NOBROADCASTKILL ? 1 : 0; error = sysctl_int(oldp, oldlenp, newp, newlen, &flag); if (error == 0 && newp) { if (flag) atomic_setbits_int(&findpr->ps_flags, PS_NOBROADCASTKILL); else atomic_clearbits_int(&findpr->ps_flags, PS_NOBROADCASTKILL); } return (error); } /* Arbitrary but reasonable limit for one iteration. */ #define VMMAP_MAXLEN MAXPHYS int sysctl_proc_vmmap(int *name, u_int namelen, void *oldp, size_t *oldlenp, struct proc *cp) { struct process *findpr; pid_t pid; int error; size_t oldlen, len; struct kinfo_vmentry *kve, *ukve; u_long *ustart, start; if (namelen > 1) return (ENOTDIR); if (namelen < 1) return (EINVAL); /* Provide max buffer length as hint. */ if (oldp == NULL) { if (oldlenp == NULL) return (EINVAL); else { *oldlenp = VMMAP_MAXLEN; return (0); } } pid = name[0]; if (pid == cp->p_p->ps_pid) { /* Self process mapping. */ findpr = cp->p_p; } else if (pid > 0) { if ((findpr = prfind(pid)) == NULL) return (ESRCH); /* Either system process or exiting/zombie */ if (findpr->ps_flags & (PS_SYSTEM | PS_EXITING)) return (EINVAL); #if 1 /* XXX Allow only root for now */ if ((error = suser(cp)) != 0) return (error); #else /* Only owner or root can get vmmap */ if (findpr->ps_ucred->cr_uid != cp->p_ucred->cr_uid && (error = suser(cp)) != 0) return (error); #endif } else { /* Only root can get kernel_map */ if ((error = suser(cp)) != 0) return (error); findpr = NULL; } /* Check the given size. */ oldlen = *oldlenp; if (oldlen == 0 || oldlen % sizeof(*kve) != 0) return (EINVAL); /* Deny huge allocation. */ if (oldlen > VMMAP_MAXLEN) return (EINVAL); /* * Iterate from the given address passed as the first element's * kve_start via oldp. */ ukve = (struct kinfo_vmentry *)oldp; ustart = &ukve->kve_start; error = copyin(ustart, &start, sizeof(start)); if (error != 0) return (error); /* Allocate wired memory to not block. */ kve = malloc(oldlen, M_TEMP, M_WAITOK); /* Set the base address and read entries. */ kve[0].kve_start = start; len = oldlen; error = fill_vmmap(findpr, kve, &len); if (error != 0 && error != ENOMEM) goto done; if (len == 0) goto done; KASSERT(len <= oldlen); KASSERT((len % sizeof(struct kinfo_vmentry)) == 0); error = copyout(kve, oldp, len); done: *oldlenp = len; free(kve, M_TEMP, oldlen); return (error); } #endif /* * Initialize disknames/diskstats for export by sysctl. If update is set, * then we simply update the disk statistics information. 
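 * (Serialized by sysctl_disklock; the name list and stats array are
 *  only rebuilt from scratch when disk_change says the set of disks
 *  changed.)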
*/ int sysctl_diskinit(int update, struct proc *p) { struct diskstats *sdk; struct disk *dk; const char *duid; int i, tlen, l; if ((i = rw_enter(&sysctl_disklock, RW_WRITE|RW_INTR)) != 0) return i; if (disk_change) { for (dk = TAILQ_FIRST(&disklist), tlen = 0; dk; dk = TAILQ_NEXT(dk, dk_link)) { if (dk->dk_name) tlen += strlen(dk->dk_name); tlen += 18; /* label uid + separators */ } tlen++; if (disknames) free(disknames, M_SYSCTL, disknameslen); if (diskstats) free(diskstats, M_SYSCTL, diskstatslen); diskstats = NULL; disknames = NULL; diskstats = mallocarray(disk_count, sizeof(struct diskstats), M_SYSCTL, M_WAITOK|M_ZERO); diskstatslen = disk_count * sizeof(struct diskstats); disknames = malloc(tlen, M_SYSCTL, M_WAITOK|M_ZERO); disknameslen = tlen; disknames[0] = '\0'; for (dk = TAILQ_FIRST(&disklist), i = 0, l = 0; dk; dk = TAILQ_NEXT(dk, dk_link), i++) { duid = NULL; if (dk->dk_label && !duid_iszero(dk->dk_label->d_uid)) duid = duid_format(dk->dk_label->d_uid); snprintf(disknames + l, tlen - l, "%s:%s,", dk->dk_name ? dk->dk_name : "", duid ? duid : ""); l += strlen(disknames + l); sdk = diskstats + i; strlcpy(sdk->ds_name, dk->dk_name, sizeof(sdk->ds_name)); mtx_enter(&dk->dk_mtx); sdk->ds_busy = dk->dk_busy; sdk->ds_rxfer = dk->dk_rxfer; sdk->ds_wxfer = dk->dk_wxfer; sdk->ds_seek = dk->dk_seek; sdk->ds_rbytes = dk->dk_rbytes; sdk->ds_wbytes = dk->dk_wbytes; sdk->ds_attachtime = dk->dk_attachtime; sdk->ds_timestamp = dk->dk_timestamp; sdk->ds_time = dk->dk_time; mtx_leave(&dk->dk_mtx); } /* Eliminate trailing comma */ if (l != 0) disknames[l - 1] = '\0'; disk_change = 0; } else if (update) { /* Just update, number of drives hasn't changed */ for (dk = TAILQ_FIRST(&disklist), i = 0; dk; dk = TAILQ_NEXT(dk, dk_link), i++) { sdk = diskstats + i; strlcpy(sdk->ds_name, dk->dk_name, sizeof(sdk->ds_name)); mtx_enter(&dk->dk_mtx); sdk->ds_busy = dk->dk_busy; sdk->ds_rxfer = dk->dk_rxfer; sdk->ds_wxfer = dk->dk_wxfer; sdk->ds_seek = dk->dk_seek; sdk->ds_rbytes = dk->dk_rbytes; sdk->ds_wbytes = dk->dk_wbytes; sdk->ds_attachtime = dk->dk_attachtime; sdk->ds_timestamp = dk->dk_timestamp; sdk->ds_time = dk->dk_time; mtx_leave(&dk->dk_mtx); } } rw_exit_write(&sysctl_disklock); return 0; } #if defined(SYSVMSG) || defined(SYSVSEM) || defined(SYSVSHM) int sysctl_sysvipc(int *name, u_int namelen, void *where, size_t *sizep) { #ifdef SYSVSEM struct sem_sysctl_info *semsi; #endif #ifdef SYSVSHM struct shm_sysctl_info *shmsi; #endif size_t infosize, dssize, tsize, buflen, bufsiz; int i, nds, error, ret; void *buf; if (namelen != 1) return (EINVAL); buflen = *sizep; switch (*name) { case KERN_SYSVIPC_MSG_INFO: #ifdef SYSVMSG return (sysctl_sysvmsg(name, namelen, where, sizep)); #else return (EOPNOTSUPP); #endif case KERN_SYSVIPC_SEM_INFO: #ifdef SYSVSEM infosize = sizeof(semsi->seminfo); nds = seminfo.semmni; dssize = sizeof(semsi->semids[0]); break; #else return (EOPNOTSUPP); #endif case KERN_SYSVIPC_SHM_INFO: #ifdef SYSVSHM infosize = sizeof(shmsi->shminfo); nds = shminfo.shmmni; dssize = sizeof(shmsi->shmids[0]); break; #else return (EOPNOTSUPP); #endif default: return (EINVAL); } tsize = infosize + (nds * dssize); /* Return just the total size required. */ if (where == NULL) { *sizep = tsize; return (0); } /* Not enough room for even the info struct. 
*/ if (buflen < infosize) { *sizep = 0; return (ENOMEM); } bufsiz = min(tsize, buflen); buf = malloc(bufsiz, M_TEMP, M_WAITOK|M_ZERO); switch (*name) { #ifdef SYSVSEM case KERN_SYSVIPC_SEM_INFO: semsi = (struct sem_sysctl_info *)buf; semsi->seminfo = seminfo; break; #endif #ifdef SYSVSHM case KERN_SYSVIPC_SHM_INFO: shmsi = (struct shm_sysctl_info *)buf; shmsi->shminfo = shminfo; break; #endif } buflen -= infosize; ret = 0; if (buflen > 0) { /* Fill in the IPC data structures. */ for (i = 0; i < nds; i++) { if (buflen < dssize) { ret = ENOMEM; break; } switch (*name) { #ifdef SYSVSEM case KERN_SYSVIPC_SEM_INFO: if (sema[i] != NULL) memcpy(&semsi->semids[i], sema[i], dssize); else memset(&semsi->semids[i], 0, dssize); break; #endif #ifdef SYSVSHM case KERN_SYSVIPC_SHM_INFO: if (shmsegs[i] != NULL) memcpy(&shmsi->shmids[i], shmsegs[i], dssize); else memset(&shmsi->shmids[i], 0, dssize); break; #endif } buflen -= dssize; } } *sizep -= buflen; error = copyout(buf, where, *sizep); free(buf, M_TEMP, bufsiz); /* If copyout succeeded, use return code set earlier. */ return (error ? error : ret); } #endif /* SYSVMSG || SYSVSEM || SYSVSHM */ #ifndef SMALL_KERNEL int sysctl_intrcnt(int *name, u_int namelen, void *oldp, size_t *oldlenp) { return (evcount_sysctl(name, namelen, oldp, oldlenp, NULL, 0)); } int sysctl_sensors(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { struct ksensor *ks; struct sensor *us; struct ksensordev *ksd; struct sensordev *usd; int dev, numt, ret; enum sensor_type type; if (namelen != 1 && namelen != 3) return (ENOTDIR); dev = name[0]; if (namelen == 1) { ret = sensordev_get(dev, &ksd); if (ret) return (ret); /* Grab a copy, to clear the kernel pointers */ usd = malloc(sizeof(*usd), M_TEMP, M_WAITOK|M_ZERO); usd->num = ksd->num; strlcpy(usd->xname, ksd->xname, sizeof(usd->xname)); memcpy(usd->maxnumt, ksd->maxnumt, sizeof(usd->maxnumt)); usd->sensors_count = ksd->sensors_count; ret = sysctl_rdstruct(oldp, oldlenp, newp, usd, sizeof(struct sensordev)); free(usd, M_TEMP, sizeof(*usd)); return (ret); } type = name[1]; numt = name[2]; ret = sensor_find(dev, type, numt, &ks); if (ret) return (ret); /* Grab a copy, to clear the kernel pointers */ us = malloc(sizeof(*us), M_TEMP, M_WAITOK|M_ZERO); memcpy(us->desc, ks->desc, sizeof(us->desc)); us->tv = ks->tv; us->value = ks->value; us->type = ks->type; us->status = ks->status; us->numt = ks->numt; us->flags = ks->flags; ret = sysctl_rdstruct(oldp, oldlenp, newp, us, sizeof(struct sensor)); free(us, M_TEMP, sizeof(*us)); return (ret); } #endif /* SMALL_KERNEL */ int sysctl_cptime2(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; int found = 0; if (namelen != 1) return (ENOTDIR); CPU_INFO_FOREACH(cii, ci) { if (name[0] == CPU_INFO_UNIT(ci)) { found = 1; break; } } if (!found) return (ENOENT); return (sysctl_rdstruct(oldp, oldlenp, newp, &ci->ci_schedstate.spc_cp_time, sizeof(ci->ci_schedstate.spc_cp_time))); } #if NAUDIO > 0 int sysctl_audio(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { if (namelen != 1) return (ENOTDIR); if (name[0] != KERN_AUDIO_RECORD) return (ENOENT); return (sysctl_int(oldp, oldlenp, newp, newlen, &audio_record_enable)); } #endif #if NVIDEO > 0 int sysctl_video(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { if (namelen != 1) return (ENOTDIR); if (name[0] != KERN_VIDEO_RECORD) return (ENOENT); return (sysctl_int(oldp, oldlenp, 
newp, newlen, &video_record_enable)); } #endif int sysctl_cpustats(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { CPU_INFO_ITERATOR cii; struct cpustats cs; struct cpu_info *ci; int found = 0; if (namelen != 1) return (ENOTDIR); CPU_INFO_FOREACH(cii, ci) { if (name[0] == CPU_INFO_UNIT(ci)) { found = 1; break; } } if (!found) return (ENOENT); memcpy(&cs.cs_time, &ci->ci_schedstate.spc_cp_time, sizeof(cs.cs_time)); cs.cs_flags = 0; if (cpu_is_online(ci)) cs.cs_flags |= CPUSTATS_ONLINE; return (sysctl_rdstruct(oldp, oldlenp, newp, &cs, sizeof(cs))); } int sysctl_utc_offset(void *oldp, size_t *oldlenp, void *newp, size_t newlen) { struct timespec adjusted, now; int adjustment_seconds, error, new_offset_minutes, old_offset_minutes; old_offset_minutes = utc_offset / 60; /* seconds -> minutes */ if (securelevel > 0) return sysctl_rdint(oldp, oldlenp, newp, old_offset_minutes); new_offset_minutes = old_offset_minutes; error = sysctl_int(oldp, oldlenp, newp, newlen, &new_offset_minutes); if (error) return error; if (new_offset_minutes < -24 * 60 || new_offset_minutes > 24 * 60) return EINVAL; if (new_offset_minutes == old_offset_minutes) return 0; utc_offset = new_offset_minutes * 60; /* minutes -> seconds */ adjustment_seconds = (new_offset_minutes - old_offset_minutes) * 60; nanotime(&now); adjusted = now; adjusted.tv_sec -= adjustment_seconds; tc_setrealtimeclock(&adjusted); resettodr(); return 0; }
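/*
 * Illustrative userland sketch (not part of kern_sysctl.c): reading the
 * per-CPU tick counters that sysctl_cptime2() above exports through the
 * {CTL_KERN, KERN_CPTIME2, cpu} MIB.  The handler only ever sees the
 * trailing {cpu} component, hence its namelen == 1 check.  CPU number 0
 * and the minimal error handling are assumptions made for this example.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/sched.h>		/* CPUSTATES, CP_USER, ... */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int mib[3] = { CTL_KERN, KERN_CPTIME2, 0 };	/* CPU 0 */
	uint64_t cp_time[CPUSTATES];
	size_t len = sizeof(cp_time);

	if (sysctl(mib, 3, cp_time, &len, NULL, 0) == -1) {
		perror("sysctl kern.cp_time2");
		return 1;
	}
	printf("user %llu sys %llu idle %llu\n",
	    (unsigned long long)cp_time[CP_USER],
	    (unsigned long long)cp_time[CP_SYS],
	    (unsigned long long)cp_time[CP_IDLE]);
	return 0;
}
#endif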
/* $OpenBSD: tty_endrun.c,v 1.8 2018/02/19 08:59:52 mpi Exp $ */ /* * Copyright (c) 2008 Marc Balmer <mbalmer@openbsd.org> * Copyright (c) 2009 Kevin Steves <stevesk@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * A tty line discipline to decode the EndRun Technologies native * time-of-day message. * http://www.endruntechnologies.com/ */ /* * EndRun Format: * * T YYYY DDD HH:MM:SS zZZ m<CR><LF> * * T is the Time Figure of Merit (TFOM) character (described below). * This is the on-time character, transmitted during the first * millisecond of each second. * * YYYY is the year * DDD is the day-of-year * : is the colon character (0x3A) * HH is the hour of the day * MM is the minute of the hour * SS is the second of the minute * z is the sign of the offset to UTC, + implies time is ahead of UTC. * ZZ is the magnitude of the offset to UTC in units of half-hours. * Non-zero only when the Timemode is Local. * m is the Timemode character and is one of: * G = GPS * L = Local * U = UTC * <CR> is the ASCII carriage return character (0x0D) * <LF> is the ASCII line feed character (0x0A) */ #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/sensors.h> #include <sys/tty.h> #include <sys/conf.h> #include <sys/time.h> #ifdef ENDRUN_DEBUG #define DPRINTFN(n, x) do { if (endrundebug > (n)) printf x; } while (0) int endrundebug = 0; #else #define DPRINTFN(n, x) #endif #define DPRINTF(x) DPRINTFN(0, x) void endrunattach(int); #define ENDRUNLEN 27 /* strlen("6 2009 018 20:41:17 +00 U\r\n") */ #define NUMFLDS 6 #ifdef ENDRUN_DEBUG #define TRUSTTIME 30 #else #define TRUSTTIME (10 * 60) /* 10 minutes */ #endif int endrun_count, endrun_nxid; struct endrun { char cbuf[ENDRUNLEN]; /* receive buffer */ struct ksensor time; /* the timedelta sensor */ struct ksensor signal; /* signal status */ struct ksensordev timedev; struct timespec ts; /* current timestamp */ struct timespec lts; /* timestamp of last TFOM */ struct timeout endrun_tout; /* invalidate sensor */ int64_t gap; /* gap between two sentences */ int64_t last; /* last time rcvd */ #define SYNC_SCAN 1 /* scanning for '\n' */ #define SYNC_EOL 2 /* '\n' seen, next char TFOM */ int sync; int pos; /* position in rcv buffer */ int no_pps; /* no PPS although requested */ #ifdef ENDRUN_DEBUG char tfom; #endif }; /* EndRun decoding */ void endrun_scan(struct endrun *, struct tty *); void endrun_decode(struct endrun *, struct tty *, char *fld[], int fldcnt); /* date and time conversion */ int endrun_atoi(char *s, int len); int endrun_date_to_nano(char *s1, char *s2, int64_t *nano); int endrun_time_to_nano(char *s, int64_t *nano); int endrun_offset_to_nano(char *s, int64_t *nano); /* degrade the timedelta sensor */ void endrun_timeout(void *); void endrunattach(int dummy) { } int endrunopen(dev_t dev, struct tty *tp, struct proc *p) { struct endrun *np; int
error; DPRINTF(("endrunopen\n")); if (tp->t_line == ENDRUNDISC) return ENODEV; if ((error = suser(p)) != 0) return error; np = malloc(sizeof(struct endrun), M_DEVBUF, M_WAITOK|M_ZERO); snprintf(np->timedev.xname, sizeof(np->timedev.xname), "endrun%d", endrun_nxid++); endrun_count++; np->time.status = SENSOR_S_UNKNOWN; np->time.type = SENSOR_TIMEDELTA; #ifndef ENDRUN_DEBUG np->time.flags = SENSOR_FINVALID; #endif sensor_attach(&np->timedev, &np->time); np->signal.type = SENSOR_PERCENT; np->signal.status = SENSOR_S_UNKNOWN; np->signal.value = 100000LL; strlcpy(np->signal.desc, "Signal", sizeof(np->signal.desc)); sensor_attach(&np->timedev, &np->signal); np->sync = SYNC_SCAN; #ifdef ENDRUN_DEBUG np->tfom = '0'; #endif tp->t_sc = (caddr_t)np; error = linesw[TTYDISC].l_open(dev, tp, p); if (error) { free(np, M_DEVBUF, sizeof(*np)); tp->t_sc = NULL; } else { sensordev_install(&np->timedev); timeout_set(&np->endrun_tout, endrun_timeout, np); } return error; } int endrunclose(struct tty *tp, int flags, struct proc *p) { struct endrun *np = (struct endrun *)tp->t_sc; DPRINTF(("endrunclose\n")); tp->t_line = TTYDISC; /* switch back to termios */ timeout_del(&np->endrun_tout); sensordev_deinstall(&np->timedev); free(np, M_DEVBUF, sizeof(*np)); tp->t_sc = NULL; endrun_count--; if (endrun_count == 0) endrun_nxid = 0; return linesw[TTYDISC].l_close(tp, flags, p); } /* collect EndRun sentence from tty */ int endruninput(int c, struct tty *tp) { struct endrun *np = (struct endrun *)tp->t_sc; struct timespec ts; int64_t gap; long tmin, tmax; if (np->sync == SYNC_EOL) { nanotime(&ts); np->pos = 0; np->sync = SYNC_SCAN; np->cbuf[np->pos++] = c; /* TFOM char */ gap = (ts.tv_sec * 1000000000LL + ts.tv_nsec) - (np->lts.tv_sec * 1000000000LL + np->lts.tv_nsec); np->lts.tv_sec = ts.tv_sec; np->lts.tv_nsec = ts.tv_nsec; if (gap <= np->gap) goto nogap; np->ts.tv_sec = ts.tv_sec; np->ts.tv_nsec = ts.tv_nsec; np->gap = gap; /* * If a tty timestamp is available, make sure its value is * reasonable by comparing against the timestamp just taken. * If they differ by more than 2 seconds, assume no PPS signal * is present, note the fact, and keep using the timestamp * value. When this happens, the sensor state is set to * CRITICAL later when the EndRun sentence is decoded. 
*/ if (tp->t_flags & (TS_TSTAMPDCDSET | TS_TSTAMPDCDCLR | TS_TSTAMPCTSSET | TS_TSTAMPCTSCLR)) { tmax = lmax(np->ts.tv_sec, tp->t_tv.tv_sec); tmin = lmin(np->ts.tv_sec, tp->t_tv.tv_sec); if (tmax - tmin > 1) np->no_pps = 1; else { np->ts.tv_sec = tp->t_tv.tv_sec; np->ts.tv_nsec = tp->t_tv.tv_usec * 1000L; np->no_pps = 0; } } } else if (c == '\n') { if (np->pos == ENDRUNLEN - 1) { /* don't copy '\n' into cbuf */ np->cbuf[np->pos] = '\0'; endrun_scan(np, tp); } np->sync = SYNC_EOL; } else { if (np->pos < ENDRUNLEN - 1) np->cbuf[np->pos++] = c; } nogap: /* pass data to termios */ return linesw[TTYDISC].l_rint(c, tp); } /* Scan the EndRun sentence just received */ void endrun_scan(struct endrun *np, struct tty *tp) { int fldcnt = 0, n; char *fld[NUMFLDS], *cs; DPRINTFN(1, ("%s\n", np->cbuf)); /* split into fields */ fld[fldcnt++] = &np->cbuf[0]; for (cs = NULL, n = 0; n < np->pos && cs == NULL; n++) { switch (np->cbuf[n]) { case '\r': np->cbuf[n] = '\0'; cs = &np->cbuf[n + 1]; break; case ' ': if (fldcnt < NUMFLDS) { np->cbuf[n] = '\0'; fld[fldcnt++] = &np->cbuf[n + 1]; } else { DPRINTF(("endrun: nr of fields in sentence " "exceeds expected: %d\n", NUMFLDS)); return; } break; } } endrun_decode(np, tp, fld, fldcnt); } /* Decode the time string */ void endrun_decode(struct endrun *np, struct tty *tp, char *fld[], int fldcnt) { int64_t date_nano, time_nano, offset_nano, endrun_now; char tfom; int jumped = 0; if (fldcnt != NUMFLDS) { DPRINTF(("endrun: field count mismatch, %d\n", fldcnt)); return; } if (endrun_time_to_nano(fld[3], &time_nano) == -1) { DPRINTF(("endrun: illegal time, %s\n", fld[3])); return; } if (endrun_date_to_nano(fld[1], fld[2], &date_nano) == -1) { DPRINTF(("endrun: illegal date, %s %s\n", fld[1], fld[2])); return; } offset_nano = 0; /* only parse offset when timemode is local */ if (fld[5][0] == 'L' && endrun_offset_to_nano(fld[4], &offset_nano) == -1) { DPRINTF(("endrun: illegal offset, %s\n", fld[4])); return; } endrun_now = date_nano + time_nano + offset_nano; if (endrun_now <= np->last) { DPRINTF(("endrun: time not monotonically increasing " "last %lld now %lld\n", (long long)np->last, (long long)endrun_now)); jumped = 1; } np->last = endrun_now; np->gap = 0LL; #ifdef ENDRUN_DEBUG if (np->time.status == SENSOR_S_UNKNOWN) { np->time.status = SENSOR_S_OK; timeout_add_sec(&np->endrun_tout, TRUSTTIME); } #endif np->time.value = np->ts.tv_sec * 1000000000LL + np->ts.tv_nsec - endrun_now; np->time.tv.tv_sec = np->ts.tv_sec; np->time.tv.tv_usec = np->ts.tv_nsec / 1000L; if (np->time.status == SENSOR_S_UNKNOWN) { np->time.status = SENSOR_S_OK; np->time.flags &= ~SENSOR_FINVALID; strlcpy(np->time.desc, "EndRun", sizeof(np->time.desc)); } /* * Only update the timeout if the clock reports the time as valid. 
* * Time Figure Of Merit (TFOM) values: * * 6 - time error is < 100 us * 7 - time error is < 1 ms * 8 - time error is < 10 ms * 9 - time error is > 10 ms, * unsynchronized state if never locked to CDMA */ switch (tfom = fld[0][0]) { case '6': case '7': case '8': np->time.status = SENSOR_S_OK; np->signal.status = SENSOR_S_OK; break; case '9': np->signal.status = SENSOR_S_WARN; break; default: DPRINTF(("endrun: invalid TFOM: '%c'\n", tfom)); np->signal.status = SENSOR_S_CRIT; break; } #ifdef ENDRUN_DEBUG if (np->tfom != tfom) { DPRINTF(("endrun: TFOM changed from %c to %c\n", np->tfom, tfom)); np->tfom = tfom; } #endif if (jumped) np->time.status = SENSOR_S_WARN; if (np->time.status == SENSOR_S_OK) timeout_add_sec(&np->endrun_tout, TRUSTTIME); /* * If tty timestamping is requested, but no PPS signal is present, set * the sensor state to CRITICAL. */ if (np->no_pps) np->time.status = SENSOR_S_CRIT; } int endrun_atoi(char *s, int len) { int n; char *p; /* make sure the input contains only numbers */ for (n = 0, p = s; n < len && *p && *p >= '0' && *p <= '9'; n++, p++) ; if (n != len || *p != '\0') return -1; for (n = 0; *s; s++) n = n * 10 + *s - '0'; return n; } /* * Convert date fields from EndRun to nanoseconds since the epoch. * The year string must be of the form YYYY. * The day of year string must be of the form DDD. * Return 0 on success, -1 if illegal characters are encountered. */ int endrun_date_to_nano(char *y, char *doy, int64_t *nano) { struct clock_ymdhms clock; time_t secs; int n, i; int year_days = 365; int month_days[] = { 0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }; #define FEBRUARY 2 #define LEAPYEAR(x) \ (((x) % 4 == 0 && \ (x) % 100 != 0) || \ (x) % 400 == 0) if ((n = endrun_atoi(y, 4)) == -1) return -1; clock.dt_year = n; if (LEAPYEAR(n)) { month_days[FEBRUARY]++; year_days++; } if ((n = endrun_atoi(doy, 3)) == -1 || n == 0 || n > year_days) return -1; /* convert day of year to month, day */ for (i = 1; n > month_days[i]; i++) { n -= month_days[i]; } clock.dt_mon = i; clock.dt_day = n; DPRINTFN(1, ("mm/dd %d/%d\n", i, n)); clock.dt_hour = clock.dt_min = clock.dt_sec = 0; secs = clock_ymdhms_to_secs(&clock); *nano = secs * 1000000000LL; return 0; } /* * Convert time field from EndRun to nanoseconds since midnight. * The string must be of the form HH:MM:SS. * Return 0 on success, -1 if illegal characters are encountered. */ int endrun_time_to_nano(char *s, int64_t *nano) { struct clock_ymdhms clock; time_t secs; int n; if (s[2] != ':' || s[5] != ':') return -1; s[2] = '\0'; s[5] = '\0'; if ((n = endrun_atoi(&s[0], 2)) == -1 || n > 23) return -1; clock.dt_hour = n; if ((n = endrun_atoi(&s[3], 2)) == -1 || n > 59) return -1; clock.dt_min = n; if ((n = endrun_atoi(&s[6], 2)) == -1 || n > 60) return -1; clock.dt_sec = n; DPRINTFN(1, ("hh:mm:ss %d:%d:%d\n", (int)clock.dt_hour, (int)clock.dt_min, (int)clock.dt_sec)); secs = clock.dt_hour * 3600 + clock.dt_min * 60 + clock.dt_sec; DPRINTFN(1, ("secs %lu\n", (unsigned long)secs)); *nano = secs * 1000000000LL; return 0; } int endrun_offset_to_nano(char *s, int64_t *nano) { time_t secs; int n; if (!(s[0] == '+' || s[0] == '-')) return -1; if ((n = endrun_atoi(&s[1], 2)) == -1) return -1; secs = n * 30 * 60; *nano = secs * 1000000000LL; if (s[0] == '+') *nano = -*nano; DPRINTFN(1, ("offset secs %lu nanosecs %lld\n", (unsigned long)secs, (long long)*nano)); return 0; } /* * Degrade the sensor state if we received no EndRun string for more than * TRUSTTIME seconds.
*/ void endrun_timeout(void *xnp) { struct endrun *np = xnp; if (np->time.status == SENSOR_S_OK) { np->time.status = SENSOR_S_WARN; /* * further degrade in TRUSTTIME seconds if no new valid EndRun * strings are received. */ timeout_add_sec(&np->endrun_tout, TRUSTTIME); } else np->time.status = SENSOR_S_CRIT; }
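/*
 * Illustrative standalone sketch (not part of this line discipline): the
 * HH:MM:SS -> nanoseconds conversion performed by endrun_time_to_nano(),
 * reduced to plain C so the arithmetic can be checked in isolation.  The
 * sample field "20:41:17" yields (20 * 3600 + 41 * 60 + 17) * 1000000000
 * = 74477000000000 ns.  The helper name is made up for this example.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int64_t
hhmmss_to_nano(const char *s)
{
	int h, m, sec, i;

	/* Same shape and digit checks as the driver: "HH:MM:SS". */
	if (strlen(s) != 8 || s[2] != ':' || s[5] != ':')
		return -1;
	for (i = 0; i < 8; i++)
		if (i != 2 && i != 5 && (s[i] < '0' || s[i] > '9'))
			return -1;
	h = (s[0] - '0') * 10 + (s[1] - '0');
	m = (s[3] - '0') * 10 + (s[4] - '0');
	sec = (s[6] - '0') * 10 + (s[7] - '0');
	/* SS may be 60: like the driver, accept a leap second. */
	if (h > 23 || m > 59 || sec > 60)
		return -1;
	return ((int64_t)h * 3600 + m * 60 + sec) * 1000000000LL;
}

int
main(void)
{
	printf("%lld\n", (long long)hhmmss_to_nano("20:41:17"));
	return 0;
}
#endif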
/* $OpenBSD: uvm_map.c,v 1.274 2021/03/26 13:40:05 mpi Exp $ */ /* $NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $ */ /* * Copyright (c) 2011 Ariane van der Steldt <ariane@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. * * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * uvm_map.c: uvm map operations */ /* #define DEBUG */ /* #define VMMAP_DEBUG */ #include <sys/param.h> #include <sys/systm.h> #include <sys/acct.h> #include <sys/mman.h> #include <sys/proc.h> #include <sys/malloc.h> #include <sys/pool.h> #include <sys/sysctl.h> #include <sys/signalvar.h> #include <sys/syslog.h> #include <sys/user.h> #include <sys/tracepoint.h> #ifdef SYSVSHM #include <sys/shm.h> #endif #include <uvm/uvm.h> #ifdef DDB #include <uvm/uvm_ddb.h> #endif #include <uvm/uvm_addr.h> vsize_t uvmspace_dused(struct vm_map*, vaddr_t, vaddr_t); int uvm_mapent_isjoinable(struct vm_map*, struct vm_map_entry*, struct vm_map_entry*); struct vm_map_entry *uvm_mapent_merge(struct vm_map*, struct vm_map_entry*, struct vm_map_entry*, struct uvm_map_deadq*); struct vm_map_entry *uvm_mapent_tryjoin(struct vm_map*, struct vm_map_entry*, struct uvm_map_deadq*); struct vm_map_entry *uvm_map_mkentry(struct vm_map*, struct vm_map_entry*, struct vm_map_entry*, vaddr_t, vsize_t, int, struct uvm_map_deadq*, struct vm_map_entry*); struct vm_map_entry *uvm_mapent_alloc(struct vm_map*, int); void uvm_mapent_free(struct vm_map_entry*); void uvm_unmap_kill_entry(struct vm_map*, struct vm_map_entry*); void uvm_unmap_detach_intrsafe(struct uvm_map_deadq *); void uvm_mapent_mkfree(struct vm_map*, struct vm_map_entry*, struct vm_map_entry**, struct uvm_map_deadq*, boolean_t); void uvm_map_pageable_pgon(struct vm_map*, struct vm_map_entry*, struct vm_map_entry*, vaddr_t, vaddr_t); int uvm_map_pageable_wire(struct vm_map*, struct vm_map_entry*, struct vm_map_entry*, vaddr_t, vaddr_t, int); void uvm_map_setup_entries(struct vm_map*); void uvm_map_setup_md(struct vm_map*); void uvm_map_teardown(struct vm_map*); void uvm_map_vmspace_update(struct vm_map*, struct uvm_map_deadq*, int); void uvm_map_kmem_grow(struct vm_map*, struct uvm_map_deadq*, vsize_t, int); void uvm_map_freelist_update_clear(struct vm_map*, struct uvm_map_deadq*); void 
uvm_map_freelist_update_refill(struct vm_map *, int); void uvm_map_freelist_update(struct vm_map*, struct uvm_map_deadq*, vaddr_t, vaddr_t, vaddr_t, vaddr_t, int); struct vm_map_entry *uvm_map_fix_space(struct vm_map*, struct vm_map_entry*, vaddr_t, vaddr_t, int); int uvm_map_sel_limits(vaddr_t*, vaddr_t*, vsize_t, int, struct vm_map_entry*, vaddr_t, vaddr_t, vaddr_t, int); int uvm_map_findspace(struct vm_map*, struct vm_map_entry**, struct vm_map_entry**, vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t, vaddr_t); vsize_t uvm_map_addr_augment_get(struct vm_map_entry*); void uvm_map_addr_augment(struct vm_map_entry*); int uvm_map_inentry_recheck(u_long, vaddr_t, struct p_inentry *); boolean_t uvm_map_inentry_fix(struct proc *, struct p_inentry *, vaddr_t, int (*)(vm_map_entry_t), u_long); /* * Tree management functions. */ static inline void uvm_mapent_copy(struct vm_map_entry*, struct vm_map_entry*); static inline int uvm_mapentry_addrcmp(const struct vm_map_entry*, const struct vm_map_entry*); void uvm_mapent_free_insert(struct vm_map*, struct uvm_addr_state*, struct vm_map_entry*); void uvm_mapent_free_remove(struct vm_map*, struct uvm_addr_state*, struct vm_map_entry*); void uvm_mapent_addr_insert(struct vm_map*, struct vm_map_entry*); void uvm_mapent_addr_remove(struct vm_map*, struct vm_map_entry*); void uvm_map_splitentry(struct vm_map*, struct vm_map_entry*, struct vm_map_entry*, vaddr_t); vsize_t uvm_map_boundary(struct vm_map*, vaddr_t, vaddr_t); /* * uvm_vmspace_fork helper functions. */ struct vm_map_entry *uvm_mapent_clone(struct vm_map*, vaddr_t, vsize_t, vsize_t, vm_prot_t, vm_prot_t, struct vm_map_entry*, struct uvm_map_deadq*, int, int); struct vm_map_entry *uvm_mapent_share(struct vm_map*, vaddr_t, vsize_t, vsize_t, vm_prot_t, vm_prot_t, struct vm_map*, struct vm_map_entry*, struct uvm_map_deadq*); struct vm_map_entry *uvm_mapent_forkshared(struct vmspace*, struct vm_map*, struct vm_map*, struct vm_map_entry*, struct uvm_map_deadq*); struct vm_map_entry *uvm_mapent_forkcopy(struct vmspace*, struct vm_map*, struct vm_map*, struct vm_map_entry*, struct uvm_map_deadq*); struct vm_map_entry *uvm_mapent_forkzero(struct vmspace*, struct vm_map*, struct vm_map*, struct vm_map_entry*, struct uvm_map_deadq*); /* * Tree validation. */ #ifdef VMMAP_DEBUG void uvm_tree_assert(struct vm_map*, int, char*, char*, int); #define UVM_ASSERT(map, cond, file, line) \ uvm_tree_assert((map), (cond), #cond, (file), (line)) void uvm_tree_sanity(struct vm_map*, char*, int); void uvm_tree_size_chk(struct vm_map*, char*, int); void vmspace_validate(struct vm_map*); #else #define uvm_tree_sanity(_map, _file, _line) do {} while (0) #define uvm_tree_size_chk(_map, _file, _line) do {} while (0) #define vmspace_validate(_map) do {} while (0) #endif /* * All architectures will have pmap_prefer. */ #ifndef PMAP_PREFER #define PMAP_PREFER_ALIGN() (vaddr_t)PAGE_SIZE #define PMAP_PREFER_OFFSET(off) 0 #define PMAP_PREFER(addr, off) (addr) #endif /* * The kernel map will initially be VM_MAP_KSIZE_INIT bytes. * Every time that gets cramped, we grow by at least VM_MAP_KSIZE_DELTA bytes. * * We attempt to grow by UVM_MAP_KSIZE_ALLOCMUL times the allocation size * each time. */ #define VM_MAP_KSIZE_INIT (512 * (vaddr_t)PAGE_SIZE) #define VM_MAP_KSIZE_DELTA (256 * (vaddr_t)PAGE_SIZE) #define VM_MAP_KSIZE_ALLOCMUL 4 /* * When selecting a random free-space block, look at most FSPACE_DELTA blocks * ahead. 
*/ #define FSPACE_DELTA 8 /* * Put allocations adjacent to previous allocations when the free-space tree * is larger than FSPACE_COMPACT entries. * * Alignment and PMAP_PREFER may still cause the entry to not be fully * adjacent. Note that this strategy reduces memory fragmentation (by leaving * a large space before or after the allocation). */ #define FSPACE_COMPACT 128 /* * Make the address selection skip at most this many bytes from the start of * the free space in which the allocation takes place. * * The main idea behind a randomized address space is that an attacker cannot * know where to target his attack. Therefore, the location of objects must be * as random as possible. However, the goal is not to create the most sparse * map that is possible. * FSPACE_MAXOFF pushes the considered range in bytes down to less insane * sizes, thereby reducing the sparseness. The biggest randomization comes * from fragmentation, i.e. FSPACE_COMPACT. */ #define FSPACE_MAXOFF ((vaddr_t)32 * 1024 * 1024) /* * Allow for small gaps in the overflow areas. * Gap size is in bytes and does not have to be a multiple of page-size. */ #define FSPACE_BIASGAP ((vaddr_t)32 * 1024) /* auto-allocate address lower bound */ #define VMMAP_MIN_ADDR PAGE_SIZE #ifdef DEADBEEF0 #define UVMMAP_DEADBEEF ((unsigned long)DEADBEEF0) #else #define UVMMAP_DEADBEEF ((unsigned long)0xdeadd0d0) #endif #ifdef DEBUG int uvm_map_printlocks = 0; #define LPRINTF(_args) \ do { \ if (uvm_map_printlocks) \ printf _args; \ } while (0) #else #define LPRINTF(_args) do {} while (0) #endif static struct mutex uvm_kmapent_mtx; static struct timeval uvm_kmapent_last_warn_time; static struct timeval uvm_kmapent_warn_rate = { 10, 0 }; const char vmmapbsy[] = "vmmapbsy"; /* * pool for vmspace structures. */ struct pool uvm_vmspace_pool; /* * pool for dynamically-allocated map entries. */ struct pool uvm_map_entry_pool; struct pool uvm_map_entry_kmem_pool; /* * This global represents the end of the kernel virtual address * space. If we want to exceed this, we must grow the kernel * virtual address space dynamically. * * Note, this variable is locked by kernel_map's lock. */ vaddr_t uvm_maxkaddr; /* * Locking predicate. */ #define UVM_MAP_REQ_WRITE(_map) \ do { \ if ((_map)->ref_count > 0) { \ if (((_map)->flags & VM_MAP_INTRSAFE) == 0) \ rw_assert_wrlock(&(_map)->lock); \ else \ MUTEX_ASSERT_LOCKED(&(_map)->mtx); \ } \ } while (0) #define vm_map_modflags(map, set, clear) \ do { \ mtx_enter(&(map)->flags_lock); \ (map)->flags = ((map)->flags | (set)) & ~(clear); \ mtx_leave(&(map)->flags_lock); \ } while (0) /* * Tree describing entries by address. * * Addresses are unique. * Entries with start == end may only exist if they are the first entry * (sorted by address) within a free-memory tree. */ static inline int uvm_mapentry_addrcmp(const struct vm_map_entry *e1, const struct vm_map_entry *e2) { return e1->start < e2->start ? -1 : e1->start > e2->start; } /* * Copy mapentry. */ static inline void uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst) { caddr_t csrc, cdst; size_t sz; csrc = (caddr_t)src; cdst = (caddr_t)dst; csrc += offsetof(struct vm_map_entry, uvm_map_entry_start_copy); cdst += offsetof(struct vm_map_entry, uvm_map_entry_start_copy); sz = offsetof(struct vm_map_entry, uvm_map_entry_stop_copy) - offsetof(struct vm_map_entry, uvm_map_entry_start_copy); memcpy(cdst, csrc, sz); } /* * Handle free-list insertion.
*/ void uvm_mapent_free_insert(struct vm_map *map, struct uvm_addr_state *uaddr, struct vm_map_entry *entry) { const struct uvm_addr_functions *fun; #ifdef VMMAP_DEBUG vaddr_t min, max, bound; #endif #ifdef VMMAP_DEBUG /* * Boundary check. * Boundaries are folded if they go on the same free list. */ min = VMMAP_FREE_START(entry); max = VMMAP_FREE_END(entry); while (min < max) { bound = uvm_map_boundary(map, min, max); KASSERT(uvm_map_uaddr(map, min) == uaddr); min = bound; } #endif KDASSERT((entry->fspace & (vaddr_t)PAGE_MASK) == 0); KASSERT((entry->etype & UVM_ET_FREEMAPPED) == 0); UVM_MAP_REQ_WRITE(map); /* Actual insert: forward to uaddr pointer. */ if (uaddr != NULL) { fun = uaddr->uaddr_functions; KDASSERT(fun != NULL); if (fun->uaddr_free_insert != NULL) (*fun->uaddr_free_insert)(map, uaddr, entry); entry->etype |= UVM_ET_FREEMAPPED; } /* Update fspace augmentation. */ uvm_map_addr_augment(entry); } /* * Handle free-list removal. */ void uvm_mapent_free_remove(struct vm_map *map, struct uvm_addr_state *uaddr, struct vm_map_entry *entry) { const struct uvm_addr_functions *fun; KASSERT((entry->etype & UVM_ET_FREEMAPPED) != 0 || uaddr == NULL); KASSERT(uvm_map_uaddr_e(map, entry) == uaddr); UVM_MAP_REQ_WRITE(map); if (uaddr != NULL) { fun = uaddr->uaddr_functions; if (fun->uaddr_free_remove != NULL) (*fun->uaddr_free_remove)(map, uaddr, entry); entry->etype &= ~UVM_ET_FREEMAPPED; } } /* * Handle address tree insertion. */ void uvm_mapent_addr_insert(struct vm_map *map, struct vm_map_entry *entry) { struct vm_map_entry *res; if (!RBT_CHECK(uvm_map_addr, entry, UVMMAP_DEADBEEF)) panic("uvm_mapent_addr_insert: entry still in addr list"); KDASSERT(entry->start <= entry->end); KDASSERT((entry->start & (vaddr_t)PAGE_MASK) == 0 && (entry->end & (vaddr_t)PAGE_MASK) == 0); TRACEPOINT(uvm, map_insert, entry->start, entry->end, entry->protection, NULL); UVM_MAP_REQ_WRITE(map); res = RBT_INSERT(uvm_map_addr, &map->addr, entry); if (res != NULL) { panic("uvm_mapent_addr_insert: map %p entry %p " "(0x%lx-0x%lx G=0x%lx F=0x%lx) insert collision " "with entry %p (0x%lx-0x%lx G=0x%lx F=0x%lx)", map, entry, entry->start, entry->end, entry->guard, entry->fspace, res, res->start, res->end, res->guard, res->fspace); } } /* * Handle address tree removal. */ void uvm_mapent_addr_remove(struct vm_map *map, struct vm_map_entry *entry) { struct vm_map_entry *res; TRACEPOINT(uvm, map_remove, entry->start, entry->end, entry->protection, NULL); UVM_MAP_REQ_WRITE(map); res = RBT_REMOVE(uvm_map_addr, &map->addr, entry); if (res != entry) panic("uvm_mapent_addr_remove"); RBT_POISON(uvm_map_addr, entry, UVMMAP_DEADBEEF); } /* * uvm_map_reference: add reference to a map * * XXX check map reference counter lock */ #define uvm_map_reference(_map) \ do { \ (_map)->ref_count++; \ } while (0) /* * Calculate the dused delta. */ vsize_t uvmspace_dused(struct vm_map *map, vaddr_t min, vaddr_t max) { struct vmspace *vm; vsize_t sz; vaddr_t lmax; vaddr_t stack_begin, stack_end; /* Position of stack.
*/ KASSERT(map->flags & VM_MAP_ISVMSPACE); vm = (struct vmspace *)map; stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); sz = 0; while (min != max) { lmax = max; if (min < stack_begin && lmax > stack_begin) lmax = stack_begin; else if (min < stack_end && lmax > stack_end) lmax = stack_end; if (min >= stack_begin && min < stack_end) { /* nothing */ } else sz += lmax - min; min = lmax; } return sz >> PAGE_SHIFT; } /* * Find the entry describing the given address. */ struct vm_map_entry* uvm_map_entrybyaddr(struct uvm_map_addr *atree, vaddr_t addr) { struct vm_map_entry *iter; iter = RBT_ROOT(uvm_map_addr, atree); while (iter != NULL) { if (iter->start > addr) iter = RBT_LEFT(uvm_map_addr, iter); else if (VMMAP_FREE_END(iter) <= addr) iter = RBT_RIGHT(uvm_map_addr, iter); else return iter; } return NULL; } /* * DEAD_ENTRY_PUSH(struct vm_map_deadq *deadq, struct vm_map_entry *entry) * * Push dead entries into a linked list. * Since the linked list abuses the address tree for storage, the entry * may not be linked in a map. * * *head must be initialized to NULL before the first call to this macro. * uvm_unmap_detach(*head, 0) will remove dead entries. */ static inline void dead_entry_push(struct uvm_map_deadq *deadq, struct vm_map_entry *entry) { TAILQ_INSERT_TAIL(deadq, entry, dfree.deadq); } #define DEAD_ENTRY_PUSH(_headptr, _entry) \ dead_entry_push((_headptr), (_entry)) /* * Helper function for uvm_map_findspace_tree. * * Given allocation constraints and pmap constraints, finds the * lowest and highest address in a range that can be used for the * allocation. * * pmap_align and pmap_off are ignored on non-PMAP_PREFER archs. * * * Big chunk of math with a seasoning of dragons. */ int uvm_map_sel_limits(vaddr_t *min, vaddr_t *max, vsize_t sz, int guardpg, struct vm_map_entry *sel, vaddr_t align, vaddr_t pmap_align, vaddr_t pmap_off, int bias) { vaddr_t sel_min, sel_max; #ifdef PMAP_PREFER vaddr_t pmap_min, pmap_max; #endif /* PMAP_PREFER */ #ifdef DIAGNOSTIC int bad; #endif /* DIAGNOSTIC */ sel_min = VMMAP_FREE_START(sel); sel_max = VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0); #ifdef PMAP_PREFER /* * There are two special cases, in which we can satisfy the align * requirement and the pmap_prefer requirement. * - when pmap_off == 0, we always select the largest of the two * - when pmap_off % align == 0 and pmap_align > align, we simply * satisfy the pmap_align requirement and automatically * satisfy the align requirement. */ if (align > PAGE_SIZE && !(pmap_align > align && (pmap_off & (align - 1)) == 0)) { /* * Simple case: only use align. */ sel_min = roundup(sel_min, align); sel_max &= ~(align - 1); if (sel_min > sel_max) return ENOMEM; /* Correct for bias. */ if (sel_max - sel_min > FSPACE_BIASGAP) { if (bias > 0) { sel_min = sel_max - FSPACE_BIASGAP; sel_min = roundup(sel_min, align); } else if (bias < 0) { sel_max = sel_min + FSPACE_BIASGAP; sel_max &= ~(align - 1); } } } else if (pmap_align != 0) { /* * Special case: satisfy both pmap_prefer and * align argument. */ pmap_max = sel_max & ~(pmap_align - 1); pmap_min = sel_min; if (pmap_max < sel_min) return ENOMEM; /* Adjust pmap_min for BIASGAP for top-addr bias. */ if (bias > 0 && pmap_max - pmap_min > FSPACE_BIASGAP) pmap_min = pmap_max - FSPACE_BIASGAP; /* Align pmap_min. 
*/ pmap_min &= ~(pmap_align - 1); if (pmap_min < sel_min) pmap_min += pmap_align; if (pmap_min > pmap_max) return ENOMEM; /* Adjust pmap_max for BIASGAP for bottom-addr bias. */ if (bias < 0 && pmap_max - pmap_min > FSPACE_BIASGAP) { pmap_max = (pmap_min + FSPACE_BIASGAP) & ~(pmap_align - 1); } if (pmap_min > pmap_max) return ENOMEM; /* Apply pmap prefer offset. */ pmap_max |= pmap_off; if (pmap_max > sel_max) pmap_max -= pmap_align; pmap_min |= pmap_off; if (pmap_min < sel_min) pmap_min += pmap_align; /* * Fixup: it's possible that pmap_min and pmap_max * cross each other. In this case, try to find one * address that is allowed. * (This usually happens in biased case.) */ if (pmap_min > pmap_max) { if (pmap_min < sel_max) pmap_max = pmap_min; else if (pmap_max > sel_min) pmap_min = pmap_max; else return ENOMEM; } /* Internal validation. */ KDASSERT(pmap_min <= pmap_max); sel_min = pmap_min; sel_max = pmap_max; } else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP) sel_min = sel_max - FSPACE_BIASGAP; else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP) sel_max = sel_min + FSPACE_BIASGAP; #else if (align > PAGE_SIZE) { sel_min = roundup(sel_min, align); sel_max &= ~(align - 1); if (sel_min > sel_max) return ENOMEM; if (bias != 0 && sel_max - sel_min > FSPACE_BIASGAP) { if (bias > 0) { sel_min = roundup(sel_max - FSPACE_BIASGAP, align); } else { sel_max = (sel_min + FSPACE_BIASGAP) & ~(align - 1); } } } else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP) sel_min = sel_max - FSPACE_BIASGAP; else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP) sel_max = sel_min + FSPACE_BIASGAP; #endif if (sel_min > sel_max) return ENOMEM; #ifdef DIAGNOSTIC bad = 0; /* Lower boundary check. */ if (sel_min < VMMAP_FREE_START(sel)) { printf("sel_min: 0x%lx, but should be at least 0x%lx\n", sel_min, VMMAP_FREE_START(sel)); bad++; } /* Upper boundary check. */ if (sel_max > VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0)) { printf("sel_max: 0x%lx, but should be at most 0x%lx\n", sel_max, VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0)); bad++; } /* Lower boundary alignment. */ if (align != 0 && (sel_min & (align - 1)) != 0) { printf("sel_min: 0x%lx, not aligned to 0x%lx\n", sel_min, align); bad++; } /* Upper boundary alignment. */ if (align != 0 && (sel_max & (align - 1)) != 0) { printf("sel_max: 0x%lx, not aligned to 0x%lx\n", sel_max, align); bad++; } /* Lower boundary PMAP_PREFER check. */ if (pmap_align != 0 && align == 0 && (sel_min & (pmap_align - 1)) != pmap_off) { printf("sel_min: 0x%lx, aligned to 0x%lx, expected 0x%lx\n", sel_min, sel_min & (pmap_align - 1), pmap_off); bad++; } /* Upper boundary PMAP_PREFER check. */ if (pmap_align != 0 && align == 0 && (sel_max & (pmap_align - 1)) != pmap_off) { printf("sel_max: 0x%lx, aligned to 0x%lx, expected 0x%lx\n", sel_max, sel_max & (pmap_align - 1), pmap_off); bad++; } if (bad) { panic("uvm_map_sel_limits(sz = %lu, guardpg = %c, " "align = 0x%lx, pmap_align = 0x%lx, pmap_off = 0x%lx, " "bias = %d, " "FREE_START(sel) = 0x%lx, FREE_END(sel) = 0x%lx)", sz, (guardpg ? 'T' : 'F'), align, pmap_align, pmap_off, bias, VMMAP_FREE_START(sel), VMMAP_FREE_END(sel)); } #endif /* DIAGNOSTIC */ *min = sel_min; *max = sel_max; return 0; } /* * Test if memory starting at addr with sz bytes is free. * * Fills in *start_ptr and *end_ptr to be the first and last entry describing * the space. * If called with prefilled *start_ptr and *end_ptr, they are to be correct. 
*/ int uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr, struct vm_map_entry **start_ptr, struct vm_map_entry **end_ptr, vaddr_t addr, vsize_t sz) { struct uvm_addr_state *free; struct uvm_map_addr *atree; struct vm_map_entry *i, *i_end; if (addr + sz < addr) return 0; /* * Kernel memory above uvm_maxkaddr is considered unavailable. */ if ((map->flags & VM_MAP_ISVMSPACE) == 0) { if (addr + sz > uvm_maxkaddr) return 0; } atree = &map->addr; /* * Fill in first, last, so they point at the entries containing the * first and last address of the range. * Note that if they are not NULL, we don't perform the lookup. */ KDASSERT(atree != NULL && start_ptr != NULL && end_ptr != NULL); if (*start_ptr == NULL) { *start_ptr = uvm_map_entrybyaddr(atree, addr); if (*start_ptr == NULL) return 0; } else KASSERT(*start_ptr == uvm_map_entrybyaddr(atree, addr)); if (*end_ptr == NULL) { if (VMMAP_FREE_END(*start_ptr) >= addr + sz) *end_ptr = *start_ptr; else { *end_ptr = uvm_map_entrybyaddr(atree, addr + sz - 1); if (*end_ptr == NULL) return 0; } } else KASSERT(*end_ptr == uvm_map_entrybyaddr(atree, addr + sz - 1)); /* Validation. */ KDASSERT(*start_ptr != NULL && *end_ptr != NULL); KDASSERT((*start_ptr)->start <= addr && VMMAP_FREE_END(*start_ptr) > addr && (*end_ptr)->start < addr + sz && VMMAP_FREE_END(*end_ptr) >= addr + sz); /* * Check that none of the entries intersects with <addr, addr+sz>. * Also, if the entry belongs to uaddr_exe or uaddr_brk_stack, it is * considered unavailable unless called by those allocators. */ i = *start_ptr; i_end = RBT_NEXT(uvm_map_addr, *end_ptr); for (; i != i_end; i = RBT_NEXT(uvm_map_addr, i)) { if (i->start != i->end && i->end > addr) return 0; /* * uaddr_exe and uaddr_brk_stack may only be used * by these allocators and the NULL uaddr (i.e. no * uaddr). * Reject if this requirement is not met. */ if (uaddr != NULL) { free = uvm_map_uaddr_e(map, i); if (uaddr != free && free != NULL && (free == map->uaddr_exe || free == map->uaddr_brk_stack)) return 0; } } return -1; } /* * Invoke each address selector until an address is found. * Will not invoke uaddr_exe. */ int uvm_map_findspace(struct vm_map *map, struct vm_map_entry**first, struct vm_map_entry**last, vaddr_t *addr, vsize_t sz, vaddr_t pmap_align, vaddr_t pmap_offset, vm_prot_t prot, vaddr_t hint) { struct uvm_addr_state *uaddr; int i; /* * Allocation for sz bytes at any address, * using the addr selectors in order. */ for (i = 0; i < nitems(map->uaddr_any); i++) { uaddr = map->uaddr_any[i]; if (uvm_addr_invoke(map, uaddr, first, last, addr, sz, pmap_align, pmap_offset, prot, hint) == 0) return 0; } /* Fall back to brk() and stack() address selectors. */ uaddr = map->uaddr_brk_stack; if (uvm_addr_invoke(map, uaddr, first, last, addr, sz, pmap_align, pmap_offset, prot, hint) == 0) return 0; return ENOMEM; } /* Calculate entry augmentation value. */ vsize_t uvm_map_addr_augment_get(struct vm_map_entry *entry) { vsize_t augment; struct vm_map_entry *left, *right; augment = entry->fspace; if ((left = RBT_LEFT(uvm_map_addr, entry)) != NULL) augment = MAX(augment, left->fspace_augment); if ((right = RBT_RIGHT(uvm_map_addr, entry)) != NULL) augment = MAX(augment, right->fspace_augment); return augment; } /* * Update augmentation data in entry. */ void uvm_map_addr_augment(struct vm_map_entry *entry) { vsize_t augment; while (entry != NULL) { /* Calculate value for augmentation. */ augment = uvm_map_addr_augment_get(entry); /* * Descend update.
* Once we find an entry that already has the correct value, * stop, since it means all its parents will use the correct * value too. */ if (entry->fspace_augment == augment) return; entry->fspace_augment = augment; entry = RBT_PARENT(uvm_map_addr, entry); } } /* * uvm_mapanon: establish a valid mapping in map for an anon * * => *addr and sz must be a multiple of PAGE_SIZE. * => *addr is ignored, except if flags contains UVM_FLAG_FIXED. * => map must be unlocked. * * => align: align vaddr, must be a power-of-2. * Align is only a hint and will be ignored if the alignment fails. */ int uvm_mapanon(struct vm_map *map, vaddr_t *addr, vsize_t sz, vsize_t align, unsigned int flags) { struct vm_map_entry *first, *last, *entry, *new; struct uvm_map_deadq dead; vm_prot_t prot; vm_prot_t maxprot; vm_inherit_t inherit; int advice; int error; vaddr_t pmap_align, pmap_offset; vaddr_t hint; KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE); KASSERT(map != kernel_map); KASSERT((map->flags & UVM_FLAG_HOLE) == 0); KASSERT((map->flags & VM_MAP_INTRSAFE) == 0); splassert(IPL_NONE); KASSERT((flags & UVM_FLAG_TRYLOCK) == 0); /* * We use pmap_align and pmap_offset as alignment and offset variables. * * Because the align parameter takes precedence over pmap prefer, * the pmap_align will need to be set to align, with pmap_offset = 0, * if pmap_prefer will not align. */ pmap_align = MAX(align, PAGE_SIZE); pmap_offset = 0; /* Decode parameters. */ prot = UVM_PROTECTION(flags); maxprot = UVM_MAXPROTECTION(flags); advice = UVM_ADVICE(flags); inherit = UVM_INHERIT(flags); error = 0; hint = trunc_page(*addr); TAILQ_INIT(&dead); KASSERT((sz & (vaddr_t)PAGE_MASK) == 0); KASSERT((align & (align - 1)) == 0); /* Check protection. */ if ((prot & maxprot) != prot) return EACCES; /* * Before grabbing the lock, allocate a map entry for later * use to ensure we don't wait for memory while holding the * vm_map_lock. */ new = uvm_mapent_alloc(map, flags); if (new == NULL) return ENOMEM; vm_map_lock(map); first = last = NULL; if (flags & UVM_FLAG_FIXED) { /* * Fixed location. * * Note: we ignore align, pmap_prefer. * Fill in first, last and *addr. */ KASSERT((*addr & PAGE_MASK) == 0); /* Check that the space is available. */ if (flags & UVM_FLAG_UNMAP) { if ((flags & UVM_FLAG_STACK) && !uvm_map_is_stack_remappable(map, *addr, sz)) { error = EINVAL; goto unlock; } uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE); } if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) { error = ENOMEM; goto unlock; } } else if (*addr != 0 && (*addr & PAGE_MASK) == 0 && (align == 0 || (*addr & (align - 1)) == 0) && uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) { /* * Address used as hint. * * Note: we enforce the alignment restriction, * but ignore pmap_prefer. */ } else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) { /* Run selection algorithm for executables. */ error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last, addr, sz, pmap_align, pmap_offset, prot, hint); if (error != 0) goto unlock; } else { /* Update freelists from vmspace. */ uvm_map_vmspace_update(map, &dead, flags); error = uvm_map_findspace(map, &first, &last, addr, sz, pmap_align, pmap_offset, prot, hint); if (error != 0) goto unlock; } /* Double-check if selected address doesn't cause overflow. */ if (*addr + sz < *addr) { error = ENOMEM; goto unlock; } /* If we only want a query, return now. */ if (flags & UVM_FLAG_QUERY) { error = 0; goto unlock; } /* * Create new entry. * first and last may be invalidated after this call. 
*/ entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead, new); if (entry == NULL) { error = ENOMEM; goto unlock; } new = NULL; KDASSERT(entry->start == *addr && entry->end == *addr + sz); entry->object.uvm_obj = NULL; entry->offset = 0; entry->protection = prot; entry->max_protection = maxprot; entry->inheritance = inherit; entry->wired_count = 0; entry->advice = advice; if (prot & PROT_WRITE) map->wserial++; if (flags & UVM_FLAG_SYSCALL) { entry->etype |= UVM_ET_SYSCALL; map->wserial++; } if (flags & UVM_FLAG_STACK) { entry->etype |= UVM_ET_STACK; if (flags & (UVM_FLAG_FIXED | UVM_FLAG_UNMAP)) map->sserial++; } if (flags & UVM_FLAG_COPYONW) { entry->etype |= UVM_ET_COPYONWRITE; if ((flags & UVM_FLAG_OVERLAY) == 0) entry->etype |= UVM_ET_NEEDSCOPY; } if (flags & UVM_FLAG_CONCEAL) entry->etype |= UVM_ET_CONCEAL; if (flags & UVM_FLAG_OVERLAY) { entry->aref.ar_pageoff = 0; entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0); } /* Update map and process statistics. */ map->size += sz; if (prot != PROT_NONE) { ((struct vmspace *)map)->vm_dused += uvmspace_dused(map, *addr, *addr + sz); } unlock: vm_map_unlock(map); /* * Remove dead entries. * * Dead entries may be the result of merging. * uvm_map_mkentry may also create dead entries, when it attempts to * destroy free-space entries. */ uvm_unmap_detach(&dead, 0); if (new) uvm_mapent_free(new); return error; } /* * uvm_map: establish a valid mapping in map * * => *addr and sz must be a multiple of PAGE_SIZE. * => map must be unlocked. * => <uobj,uoffset> value meanings (4 cases): * [1] <NULL,uoffset> == uoffset is a hint for PMAP_PREFER * [2] <NULL,UVM_UNKNOWN_OFFSET> == don't PMAP_PREFER * [3] <uobj,uoffset> == normal mapping * [4] <uobj,UVM_UNKNOWN_OFFSET> == uvm_map finds offset based on VA * * case [4] is for kernel mappings where we don't know the offset until * we've found a virtual address. note that kernel object offsets are * always relative to vm_map_min(kernel_map). * * => align: align vaddr, must be a power-of-2. * Align is only a hint and will be ignored if the alignment fails. */ int uvm_map(struct vm_map *map, vaddr_t *addr, vsize_t sz, struct uvm_object *uobj, voff_t uoffset, vsize_t align, unsigned int flags) { struct vm_map_entry *first, *last, *entry, *new; struct uvm_map_deadq dead; vm_prot_t prot; vm_prot_t maxprot; vm_inherit_t inherit; int advice; int error; vaddr_t pmap_align, pmap_offset; vaddr_t hint; if ((map->flags & VM_MAP_INTRSAFE) == 0) splassert(IPL_NONE); else splassert(IPL_VM); /* * We use pmap_align and pmap_offset as alignment and offset variables. * * Because the align parameter takes precedence over pmap prefer, * the pmap_align will need to be set to align, with pmap_offset = 0, * if pmap_prefer will not align. */ if (uoffset == UVM_UNKNOWN_OFFSET) { pmap_align = MAX(align, PAGE_SIZE); pmap_offset = 0; } else { pmap_align = MAX(PMAP_PREFER_ALIGN(), PAGE_SIZE); pmap_offset = PMAP_PREFER_OFFSET(uoffset); if (align == 0 || (align <= pmap_align && (pmap_offset & (align - 1)) == 0)) { /* pmap_offset satisfies align, no change. */ } else { /* Align takes precedence over pmap prefer. */ pmap_align = align; pmap_offset = 0; } } /* Decode parameters. */ prot = UVM_PROTECTION(flags); maxprot = UVM_MAXPROTECTION(flags); advice = UVM_ADVICE(flags); inherit = UVM_INHERIT(flags); error = 0; hint = trunc_page(*addr); TAILQ_INIT(&dead); KASSERT((sz & (vaddr_t)PAGE_MASK) == 0); KASSERT((align & (align - 1)) == 0); /* Holes are incompatible with other types of mappings. 
*/ if (flags & UVM_FLAG_HOLE) { KASSERT(uobj == NULL && (flags & UVM_FLAG_FIXED) && (flags & (UVM_FLAG_OVERLAY | UVM_FLAG_COPYONW)) == 0); } /* Unset hint for kernel_map non-fixed allocations. */ if (!(map->flags & VM_MAP_ISVMSPACE) && !(flags & UVM_FLAG_FIXED)) hint = 0; /* Check protection. */ if ((prot & maxprot) != prot) return EACCES; if (map == kernel_map && (prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC)) panic("uvm_map: kernel map W^X violation requested"); /* * Before grabbing the lock, allocate a map entry for later * use to ensure we don't wait for memory while holding the * vm_map_lock. */ new = uvm_mapent_alloc(map, flags); if (new == NULL) return ENOMEM; if (flags & UVM_FLAG_TRYLOCK) { if (vm_map_lock_try(map) == FALSE) { error = EFAULT; goto out; } } else { vm_map_lock(map); } first = last = NULL; if (flags & UVM_FLAG_FIXED) { /* * Fixed location. * * Note: we ignore align, pmap_prefer. * Fill in first, last and *addr. */ KASSERT((*addr & PAGE_MASK) == 0); /* * Grow pmap to include allocated address. * If the growth fails, the allocation will fail too. */ if ((map->flags & VM_MAP_ISVMSPACE) == 0 && uvm_maxkaddr < (*addr + sz)) { uvm_map_kmem_grow(map, &dead, *addr + sz - uvm_maxkaddr, flags); } /* Check that the space is available. */ if (flags & UVM_FLAG_UNMAP) uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE); if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) { error = ENOMEM; goto unlock; } } else if (*addr != 0 && (*addr & PAGE_MASK) == 0 && (map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE && (align == 0 || (*addr & (align - 1)) == 0) && uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) { /* * Address used as hint. * * Note: we enforce the alignment restriction, * but ignore pmap_prefer. */ } else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) { /* Run selection algorithm for executables. */ error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last, addr, sz, pmap_align, pmap_offset, prot, hint); /* Grow kernel memory and try again. */ if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) { uvm_map_kmem_grow(map, &dead, sz, flags); error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last, addr, sz, pmap_align, pmap_offset, prot, hint); } if (error != 0) goto unlock; } else { /* Update freelists from vmspace. */ if (map->flags & VM_MAP_ISVMSPACE) uvm_map_vmspace_update(map, &dead, flags); error = uvm_map_findspace(map, &first, &last, addr, sz, pmap_align, pmap_offset, prot, hint); /* Grow kernel memory and try again. */ if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) { uvm_map_kmem_grow(map, &dead, sz, flags); error = uvm_map_findspace(map, &first, &last, addr, sz, pmap_align, pmap_offset, prot, hint); } if (error != 0) goto unlock; } /* Double-check if selected address doesn't cause overflow. */ if (*addr + sz < *addr) { error = ENOMEM; goto unlock; } KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE || uvm_maxkaddr >= *addr + sz); /* If we only want a query, return now. */ if (flags & UVM_FLAG_QUERY) { error = 0; goto unlock; } if (uobj == NULL) uoffset = 0; else if (uoffset == UVM_UNKNOWN_OFFSET) { KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj)); uoffset = *addr - vm_map_min(kernel_map); } /* * Create new entry. * first and last may be invalidated after this call. 
*/ entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead, new); if (entry == NULL) { error = ENOMEM; goto unlock; } new = NULL; KDASSERT(entry->start == *addr && entry->end == *addr + sz); entry->object.uvm_obj = uobj; entry->offset = uoffset; entry->protection = prot; entry->max_protection = maxprot; entry->inheritance = inherit; entry->wired_count = 0; entry->advice = advice; if (prot & PROT_WRITE) map->wserial++; if (flags & UVM_FLAG_SYSCALL) { entry->etype |= UVM_ET_SYSCALL; map->wserial++; } if (flags & UVM_FLAG_STACK) { entry->etype |= UVM_ET_STACK; if (flags & UVM_FLAG_UNMAP) map->sserial++; } if (uobj) entry->etype |= UVM_ET_OBJ; else if (flags & UVM_FLAG_HOLE) entry->etype |= UVM_ET_HOLE; if (flags & UVM_FLAG_NOFAULT) entry->etype |= UVM_ET_NOFAULT; if (flags & UVM_FLAG_WC) entry->etype |= UVM_ET_WC; if (flags & UVM_FLAG_COPYONW) { entry->etype |= UVM_ET_COPYONWRITE; if ((flags & UVM_FLAG_OVERLAY) == 0) entry->etype |= UVM_ET_NEEDSCOPY; } if (flags & UVM_FLAG_CONCEAL) entry->etype |= UVM_ET_CONCEAL; if (flags & UVM_FLAG_OVERLAY) { entry->aref.ar_pageoff = 0; entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0); } /* Update map and process statistics. */ if (!(flags & UVM_FLAG_HOLE)) { map->size += sz; if ((map->flags & VM_MAP_ISVMSPACE) && uobj == NULL && prot != PROT_NONE) { ((struct vmspace *)map)->vm_dused += uvmspace_dused(map, *addr, *addr + sz); } } /* * Try to merge entry. * * Userland allocations are kept separated most of the time. * Forego the effort of merging what most of the time can't be merged * and only try the merge if it concerns a kernel entry. */ if ((flags & UVM_FLAG_NOMERGE) == 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) uvm_mapent_tryjoin(map, entry, &dead); unlock: vm_map_unlock(map); /* * Remove dead entries. * * Dead entries may be the result of merging. * uvm_map_mkentry may also create dead entries, when it attempts to * destroy free-space entries. */ if (map->flags & VM_MAP_INTRSAFE) uvm_unmap_detach_intrsafe(&dead); else uvm_unmap_detach(&dead, 0); out: if (new) uvm_mapent_free(new); return error; } /* * True iff e1 and e2 can be joined together. */ int uvm_mapent_isjoinable(struct vm_map *map, struct vm_map_entry *e1, struct vm_map_entry *e2) { KDASSERT(e1 != NULL && e2 != NULL); /* Must be the same entry type and not have free memory between. */ if (e1->etype != e2->etype || e1->end != e2->start) return 0; /* Submaps are never joined. */ if (UVM_ET_ISSUBMAP(e1)) return 0; /* Never merge wired memory. */ if (VM_MAPENT_ISWIRED(e1) || VM_MAPENT_ISWIRED(e2)) return 0; /* Protection, inheritance and advice must be equal. */ if (e1->protection != e2->protection || e1->max_protection != e2->max_protection || e1->inheritance != e2->inheritance || e1->advice != e2->advice) return 0; /* If uvm_object: object itself and offsets within object must match. */ if (UVM_ET_ISOBJ(e1)) { if (e1->object.uvm_obj != e2->object.uvm_obj) return 0; if (e1->offset + (e1->end - e1->start) != e2->offset) return 0; } /* * Cannot join shared amaps. * Note: no need to lock amap to look at refs, since we don't care * about its exact value. * If it is 1 (i.e. we have the only reference) it will stay there. */ if (e1->aref.ar_amap && amap_refs(e1->aref.ar_amap) != 1) return 0; if (e2->aref.ar_amap && amap_refs(e2->aref.ar_amap) != 1) return 0; /* Apparently, e1 and e2 match. */ return 1; } /* * Join support function. * * Returns the merged entry on success. * Returns NULL if the merge failed. 
*/ struct vm_map_entry* uvm_mapent_merge(struct vm_map *map, struct vm_map_entry *e1, struct vm_map_entry *e2, struct uvm_map_deadq *dead) { struct uvm_addr_state *free; /* * Merging is not supported for map entries that * contain an amap in e1. This should never happen * anyway, because only kernel entries are merged. * These do not contain amaps. * e2 contains no real information in its amap, * so it can be erased immediately. */ KASSERT(e1->aref.ar_amap == NULL); /* * Don't drop obj reference: * uvm_unmap_detach will do this for us. */ free = uvm_map_uaddr_e(map, e1); uvm_mapent_free_remove(map, free, e1); free = uvm_map_uaddr_e(map, e2); uvm_mapent_free_remove(map, free, e2); uvm_mapent_addr_remove(map, e2); e1->end = e2->end; e1->guard = e2->guard; e1->fspace = e2->fspace; uvm_mapent_free_insert(map, free, e1); DEAD_ENTRY_PUSH(dead, e2); return e1; } /* * Attempt forward and backward joining of entry. * * Returns entry after joins. * We are guaranteed that the amap of entry is either non-existent or * has never been used. */ struct vm_map_entry* uvm_mapent_tryjoin(struct vm_map *map, struct vm_map_entry *entry, struct uvm_map_deadq *dead) { struct vm_map_entry *other; struct vm_map_entry *merged; /* Merge with previous entry. */ other = RBT_PREV(uvm_map_addr, entry); if (other && uvm_mapent_isjoinable(map, other, entry)) { merged = uvm_mapent_merge(map, other, entry, dead); if (merged) entry = merged; } /* * Merge with next entry. * * Because amap can only extend forward and the next entry * probably contains sensible info, only perform forward merging * in the absence of an amap. */ other = RBT_NEXT(uvm_map_addr, entry); if (other && entry->aref.ar_amap == NULL && other->aref.ar_amap == NULL && uvm_mapent_isjoinable(map, entry, other)) { merged = uvm_mapent_merge(map, entry, other, dead); if (merged) entry = merged; } return entry; } /* * Kill entries that are no longer in a map. */ void uvm_unmap_detach(struct uvm_map_deadq *deadq, int flags) { struct vm_map_entry *entry, *tmp; int waitok = flags & UVM_PLA_WAITOK; TAILQ_FOREACH_SAFE(entry, deadq, dfree.deadq, tmp) { /* Skip entries for which we have to grab the kernel lock. */ if (entry->aref.ar_amap || UVM_ET_ISSUBMAP(entry) || UVM_ET_ISOBJ(entry)) continue; TAILQ_REMOVE(deadq, entry, dfree.deadq); uvm_mapent_free(entry); } if (TAILQ_EMPTY(deadq)) return; KERNEL_LOCK(); while ((entry = TAILQ_FIRST(deadq)) != NULL) { if (waitok) uvm_pause(); /* Drop reference to amap, if we've got one. */ if (entry->aref.ar_amap) amap_unref(entry->aref.ar_amap, entry->aref.ar_pageoff, atop(entry->end - entry->start), flags & AMAP_REFALL); /* Drop reference to our backing object, if we've got one. */ if (UVM_ET_ISSUBMAP(entry)) { /* ... unlikely to happen, but play it safe */ uvm_map_deallocate(entry->object.sub_map); } else if (UVM_ET_ISOBJ(entry) && entry->object.uvm_obj->pgops->pgo_detach) { entry->object.uvm_obj->pgops->pgo_detach( entry->object.uvm_obj); } /* Step to next. */ TAILQ_REMOVE(deadq, entry, dfree.deadq); uvm_mapent_free(entry); } KERNEL_UNLOCK(); } void uvm_unmap_detach_intrsafe(struct uvm_map_deadq *deadq) { struct vm_map_entry *entry; while ((entry = TAILQ_FIRST(deadq)) != NULL) { KASSERT(entry->aref.ar_amap == NULL); KASSERT(!UVM_ET_ISSUBMAP(entry)); KASSERT(!UVM_ET_ISOBJ(entry)); TAILQ_REMOVE(deadq, entry, dfree.deadq); uvm_mapent_free(entry); } } /* * Create and insert new entry. * * Returned entry contains new addresses and is inserted properly in the tree. * first and last are (probably) no longer valid. 
*/ struct vm_map_entry* uvm_map_mkentry(struct vm_map *map, struct vm_map_entry *first, struct vm_map_entry *last, vaddr_t addr, vsize_t sz, int flags, struct uvm_map_deadq *dead, struct vm_map_entry *new) { struct vm_map_entry *entry, *prev; struct uvm_addr_state *free; vaddr_t min, max; /* free space boundaries for new entry */ KDASSERT(map != NULL); KDASSERT(first != NULL); KDASSERT(last != NULL); KDASSERT(dead != NULL); KDASSERT(sz > 0); KDASSERT(addr + sz > addr); KDASSERT(first->end <= addr && VMMAP_FREE_END(first) > addr); KDASSERT(last->start < addr + sz && VMMAP_FREE_END(last) >= addr + sz); KDASSERT(uvm_map_isavail(map, NULL, &first, &last, addr, sz)); uvm_tree_sanity(map, __FILE__, __LINE__); min = addr + sz; max = VMMAP_FREE_END(last); /* Initialize new entry. */ if (new == NULL) entry = uvm_mapent_alloc(map, flags); else entry = new; if (entry == NULL) return NULL; entry->offset = 0; entry->etype = 0; entry->wired_count = 0; entry->aref.ar_pageoff = 0; entry->aref.ar_amap = NULL; entry->start = addr; entry->end = min; entry->guard = 0; entry->fspace = 0; /* Reset free space in first. */ free = uvm_map_uaddr_e(map, first); uvm_mapent_free_remove(map, free, first); first->guard = 0; first->fspace = 0; /* * Remove all entries that are fully replaced. * We are iterating using last in reverse order. */ for (; first != last; last = prev) { prev = RBT_PREV(uvm_map_addr, last); KDASSERT(last->start == last->end); free = uvm_map_uaddr_e(map, last); uvm_mapent_free_remove(map, free, last); uvm_mapent_addr_remove(map, last); DEAD_ENTRY_PUSH(dead, last); } /* Remove first if it is entirely inside <addr, addr+sz>. */ if (first->start == addr) { uvm_mapent_addr_remove(map, first); DEAD_ENTRY_PUSH(dead, first); } else { uvm_map_fix_space(map, first, VMMAP_FREE_START(first), addr, flags); } /* Finally, link in entry. */ uvm_mapent_addr_insert(map, entry); uvm_map_fix_space(map, entry, min, max, flags); uvm_tree_sanity(map, __FILE__, __LINE__); return entry; } /* * uvm_mapent_alloc: allocate a map entry */ struct vm_map_entry * uvm_mapent_alloc(struct vm_map *map, int flags) { struct vm_map_entry *me, *ne; int pool_flags; int i; pool_flags = PR_WAITOK; if (flags & UVM_FLAG_TRYLOCK) pool_flags = PR_NOWAIT; if (map->flags & VM_MAP_INTRSAFE || cold) { mtx_enter(&uvm_kmapent_mtx); if (SLIST_EMPTY(&uvm.kentry_free)) { ne = km_alloc(PAGE_SIZE, &kv_page, &kp_dirty, &kd_nowait); if (ne == NULL) panic("uvm_mapent_alloc: cannot allocate map " "entry"); for (i = 0; i < PAGE_SIZE / sizeof(*ne); i++) { SLIST_INSERT_HEAD(&uvm.kentry_free, &ne[i], daddrs.addr_kentry); } if (ratecheck(&uvm_kmapent_last_warn_time, &uvm_kmapent_warn_rate)) printf("uvm_mapent_alloc: out of static " "map entries\n"); } me = SLIST_FIRST(&uvm.kentry_free); SLIST_REMOVE_HEAD(&uvm.kentry_free, daddrs.addr_kentry); uvmexp.kmapent++; mtx_leave(&uvm_kmapent_mtx); me->flags = UVM_MAP_STATIC; } else if (map == kernel_map) { splassert(IPL_NONE); me = pool_get(&uvm_map_entry_kmem_pool, pool_flags); if (me == NULL) goto out; me->flags = UVM_MAP_KMEM; } else { splassert(IPL_NONE); me = pool_get(&uvm_map_entry_pool, pool_flags); if (me == NULL) goto out; me->flags = 0; } RBT_POISON(uvm_map_addr, me, UVMMAP_DEADBEEF); out: return me; } /* * uvm_mapent_free: free map entry * * => XXX: static pool for kernel map? 
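 *
 * The release path mirrors the three allocation classes in
 * uvm_mapent_alloc() above: UVM_MAP_STATIC entries go back on the static
 * kentry_free list, UVM_MAP_KMEM entries to the kernel-map entry pool,
 * and everything else to the general map entry pool.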
*/ void uvm_mapent_free(struct vm_map_entry *me) { if (me->flags & UVM_MAP_STATIC) { mtx_enter(&uvm_kmapent_mtx); SLIST_INSERT_HEAD(&uvm.kentry_free, me, daddrs.addr_kentry); uvmexp.kmapent--; mtx_leave(&uvm_kmapent_mtx); } else if (me->flags & UVM_MAP_KMEM) { splassert(IPL_NONE); pool_put(&uvm_map_entry_kmem_pool, me); } else { splassert(IPL_NONE); pool_put(&uvm_map_entry_pool, me); } } /* * uvm_map_lookup_entry: find map entry at or before an address. * * => map must at least be read-locked by caller * => entry is returned in "entry" * => return value is true if address is in the returned entry * ET_HOLE entries are considered to not contain a mapping, ergo FALSE is * returned for those mappings. */ boolean_t uvm_map_lookup_entry(struct vm_map *map, vaddr_t address, struct vm_map_entry **entry) { *entry = uvm_map_entrybyaddr(&map->addr, address); return *entry != NULL && !UVM_ET_ISHOLE(*entry) && (*entry)->start <= address && (*entry)->end > address; } /* * Stack must be in a MAP_STACK entry. PROT_NONE indicates stack not yet * grown -- then uvm_map_check_region_range() should not cache the entry * because growth won't be seen. */ int uvm_map_inentry_sp(vm_map_entry_t entry) { if ((entry->etype & UVM_ET_STACK) == 0) { if (entry->protection == PROT_NONE) return (-1); /* don't update range */ return (0); } return (1); } /* * The system call must not come from a writable entry; otherwise W^X * is violated. * (It would be nice if we could spot aliasing, which is also kind of bad.) * * The system call must come from a syscall-labeled entry (which are * the text regions of the main program, sigtramp, ld.so, or libc). */ int uvm_map_inentry_pc(vm_map_entry_t entry) { if (entry->protection & PROT_WRITE) return (0); /* not permitted */ if ((entry->etype & UVM_ET_SYSCALL) == 0) return (0); /* not permitted */ return (1); } int uvm_map_inentry_recheck(u_long serial, vaddr_t addr, struct p_inentry *ie) { return (serial != ie->ie_serial || ie->ie_start == 0 || addr < ie->ie_start || addr >= ie->ie_end); } /* * Inside a vm_map, find the entry containing a register-supplied address * (stack pointer or program counter) and verify it via the given function. * Remember the low and high addresses of the region if valid and return TRUE, * else return FALSE. */ boolean_t uvm_map_inentry_fix(struct proc *p, struct p_inentry *ie, vaddr_t addr, int (*fn)(vm_map_entry_t), u_long serial) { vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; int ret; if (addr < map->min_offset || addr >= map->max_offset) return (FALSE); /* lock map */ vm_map_lock_read(map); /* lookup */ if (!uvm_map_lookup_entry(map, trunc_page(addr), &entry)) { vm_map_unlock_read(map); return (FALSE); } ret = (*fn)(entry); if (ret == 0) { vm_map_unlock_read(map); return (FALSE); } else if (ret == 1) { ie->ie_start = entry->start; ie->ie_end = entry->end; ie->ie_serial = serial; } else { /* do not update, re-check later */ } vm_map_unlock_read(map); return (TRUE); } boolean_t uvm_map_inentry(struct proc *p, struct p_inentry *ie, vaddr_t addr, const char *fmt, int (*fn)(vm_map_entry_t), u_long serial) { union sigval sv; boolean_t ok = TRUE; if (uvm_map_inentry_recheck(serial, addr, ie)) { ok = uvm_map_inentry_fix(p, ie, addr, fn, serial); if (!ok) { KERNEL_LOCK(); printf(fmt, p->p_p->ps_comm, p->p_p->ps_pid, p->p_tid, addr, ie->ie_start, ie->ie_end); p->p_p->ps_acflag |= AMAP; sv.sival_ptr = (void *)PROC_PC(p); trapsignal(p, SIGSEGV, 0, SEGV_ACCERR, sv); KERNEL_UNLOCK(); } } return (ok); } /* * Check whether the given address range can be converted to a MAP_STACK * mapping. * * Must be called with map locked.
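 *
 * A minimal calling sketch (illustrative only; addr and sz are
 * hypothetical values):
 *
 *	vm_map_lock(map);
 *	ok = uvm_map_is_stack_remappable(map, addr, sz);
 *	vm_map_unlock(map);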
*/ boolean_t uvm_map_is_stack_remappable(struct vm_map *map, vaddr_t addr, vaddr_t sz) { vaddr_t end = addr + sz; struct vm_map_entry *first, *iter, *prev = NULL; if (!uvm_map_lookup_entry(map, addr, &first)) { printf("map stack 0x%lx-0x%lx of map %p failed: no mapping\n", addr, end, map); return FALSE; } /* * Check that the address range exists and is contiguous. */ for (iter = first; iter != NULL && iter->start < end; prev = iter, iter = RBT_NEXT(uvm_map_addr, iter)) { /* * Make sure that we do not have holes in the range. */ #if 0 if (prev != NULL) { printf("prev->start 0x%lx, prev->end 0x%lx, " "iter->start 0x%lx, iter->end 0x%lx\n", prev->start, prev->end, iter->start, iter->end); } #endif if (prev != NULL && prev->end != iter->start) { printf("map stack 0x%lx-0x%lx of map %p failed: " "hole in range\n", addr, end, map); return FALSE; } if (iter->start == iter->end || UVM_ET_ISHOLE(iter)) { printf("map stack 0x%lx-0x%lx of map %p failed: " "hole in range\n", addr, end, map); return FALSE; } } return TRUE; } /* * Remap the middle-pages of an existing mapping as a stack range. * If there exists a previous contiguous mapping with the given range * [addr, addr + sz), with protection PROT_READ|PROT_WRITE, then the * mapping is dropped, and a new anon mapping is created and marked as * a stack. * * Must be called with map unlocked. */ int uvm_map_remap_as_stack(struct proc *p, vaddr_t addr, vaddr_t sz) { vm_map_t map = &p->p_vmspace->vm_map; vaddr_t start, end; int error; int flags = UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_INHERIT_COPY, MADV_NORMAL, UVM_FLAG_STACK | UVM_FLAG_FIXED | UVM_FLAG_UNMAP | UVM_FLAG_COPYONW); start = round_page(addr); end = trunc_page(addr + sz); #ifdef MACHINE_STACK_GROWS_UP if (end == addr + sz) end -= PAGE_SIZE; #else if (start == addr) start += PAGE_SIZE; #endif if (start < map->min_offset || end >= map->max_offset || end < start) return EINVAL; error = uvm_mapanon(map, &start, end - start, 0, flags); if (error != 0) printf("map stack for pid %d failed\n", p->p_p->ps_pid); return error; } /* * uvm_map_pie: return a random load address for a PIE executable * properly aligned. */ #ifndef VM_PIE_MAX_ADDR #define VM_PIE_MAX_ADDR (VM_MAXUSER_ADDRESS / 4) #endif #ifndef VM_PIE_MIN_ADDR #define VM_PIE_MIN_ADDR VM_MIN_ADDRESS #endif #ifndef VM_PIE_MIN_ALIGN #define VM_PIE_MIN_ALIGN PAGE_SIZE #endif vaddr_t uvm_map_pie(vaddr_t align) { vaddr_t addr, space, min; align = MAX(align, VM_PIE_MIN_ALIGN); /* round up to next alignment */ min = (VM_PIE_MIN_ADDR + align - 1) & ~(align - 1); if (align >= VM_PIE_MAX_ADDR || min >= VM_PIE_MAX_ADDR) return (align); space = (VM_PIE_MAX_ADDR - min) / align; space = MIN(space, (u_int32_t)-1); addr = (vaddr_t)arc4random_uniform((u_int32_t)space) * align; addr += min; return (addr); } void uvm_unmap(struct vm_map *map, vaddr_t start, vaddr_t end) { struct uvm_map_deadq dead; KASSERT((start & (vaddr_t)PAGE_MASK) == 0 && (end & (vaddr_t)PAGE_MASK) == 0); TAILQ_INIT(&dead); vm_map_lock(map); uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE); vm_map_unlock(map); if (map->flags & VM_MAP_INTRSAFE) uvm_unmap_detach_intrsafe(&dead); else uvm_unmap_detach(&dead, 0); } /* * Mark entry as free. * * entry will be put on the dead list. * The free space will be merged into the previous or a new entry, * unless markfree is false. 
*/ void uvm_mapent_mkfree(struct vm_map *map, struct vm_map_entry *entry, struct vm_map_entry **prev_ptr, struct uvm_map_deadq *dead, boolean_t markfree) { struct uvm_addr_state *free; struct vm_map_entry *prev; vaddr_t addr; /* Start of freed range. */ vaddr_t end; /* End of freed range. */ prev = *prev_ptr; if (prev == entry) *prev_ptr = prev = NULL; if (prev == NULL || VMMAP_FREE_END(prev) != entry->start) prev = RBT_PREV(uvm_map_addr, entry); /* Entry is describing only free memory and has nothing to drain into. */ if (prev == NULL && entry->start == entry->end && markfree) { *prev_ptr = entry; return; } addr = entry->start; end = VMMAP_FREE_END(entry); free = uvm_map_uaddr_e(map, entry); uvm_mapent_free_remove(map, free, entry); uvm_mapent_addr_remove(map, entry); DEAD_ENTRY_PUSH(dead, entry); if (markfree) { if (prev) { free = uvm_map_uaddr_e(map, prev); uvm_mapent_free_remove(map, free, prev); } *prev_ptr = uvm_map_fix_space(map, prev, addr, end, 0); } } /* * Unwire and release referenced amap and object from map entry. */ void uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry) { /* Unwire removed map entry. */ if (VM_MAPENT_ISWIRED(entry)) { KERNEL_LOCK(); entry->wired_count = 0; uvm_fault_unwire_locked(map, entry->start, entry->end); KERNEL_UNLOCK(); } /* Entry-type specific code. */ if (UVM_ET_ISHOLE(entry)) { /* Nothing to be done for holes. */ } else if (map->flags & VM_MAP_INTRSAFE) { KASSERT(vm_map_pmap(map) == pmap_kernel()); uvm_km_pgremove_intrsafe(entry->start, entry->end); pmap_kremove(entry->start, entry->end - entry->start); } else if (UVM_ET_ISOBJ(entry) && UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) { KASSERT(vm_map_pmap(map) == pmap_kernel()); /* * Note: kernel object mappings are currently used in * two ways: * [1] "normal" mappings of pages in the kernel object * [2] uvm_km_valloc'd allocations in which we * pmap_enter in some non-kernel-object page * (e.g. vmapbuf). * * for case [1], we need to remove the mapping from * the pmap and then remove the page from the kernel * object (because, once pages in a kernel object are * unmapped they are no longer needed, unlike, say, * a vnode where you might want the data to persist * until flushed out of a queue). * * for case [2], we need to remove the mapping from * the pmap. there shouldn't be any pages at the * specified offset in the kernel object [but it * doesn't hurt to call uvm_km_pgremove just to be * safe?] * * uvm_km_pgremove currently does the following: * for pages in the kernel object range: * - drops the swap slot * - uvm_pagefree the page * * note there is a version of uvm_km_pgremove() that * is used for "intrsafe" objects. */ /* * remove mappings from pmap and drop the pages * from the object. offsets are always relative * to vm_map_min(kernel_map). */ pmap_remove(pmap_kernel(), entry->start, entry->end); uvm_km_pgremove(entry->object.uvm_obj, entry->start - vm_map_min(kernel_map), entry->end - vm_map_min(kernel_map)); /* * null out kernel_object reference, we've just * dropped it */ entry->etype &= ~UVM_ET_OBJ; entry->object.uvm_obj = NULL; /* to be safe */ } else { /* remove mappings the standard way. */ pmap_remove(map->pmap, entry->start, entry->end); } } /* * Remove all entries from start to end. * * If remove_holes, then remove ET_HOLE entries as well. * If markfree, entries will be properly marked free; otherwise, no replacement * entry will be put in the tree (corrupting the tree).
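 *
 * The usual calling pattern (cf. uvm_unmap() above) is:
 *
 *	TAILQ_INIT(&dead);
 *	vm_map_lock(map);
 *	uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
 *	vm_map_unlock(map);
 *	uvm_unmap_detach(&dead, 0);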
*/ void uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end, struct uvm_map_deadq *dead, boolean_t remove_holes, boolean_t markfree) { struct vm_map_entry *prev_hint, *next, *entry; start = MAX(start, map->min_offset); end = MIN(end, map->max_offset); if (start >= end) return; if ((map->flags & VM_MAP_INTRSAFE) == 0) splassert(IPL_NONE); else splassert(IPL_VM); /* Find first affected entry. */ entry = uvm_map_entrybyaddr(&map->addr, start); KDASSERT(entry != NULL && entry->start <= start); if (entry->end <= start && markfree) entry = RBT_NEXT(uvm_map_addr, entry); else UVM_MAP_CLIP_START(map, entry, start); /* * Iterate entries until we reach end address. * prev_hint hints where the freed space can be appended to. */ prev_hint = NULL; for (; entry != NULL && entry->start < end; entry = next) { KDASSERT(entry->start >= start); if (entry->end > end || !markfree) UVM_MAP_CLIP_END(map, entry, end); KDASSERT(entry->start >= start && entry->end <= end); next = RBT_NEXT(uvm_map_addr, entry); /* Don't remove holes unless asked to do so. */ if (UVM_ET_ISHOLE(entry)) { if (!remove_holes) { prev_hint = entry; continue; } } /* A stack has been removed. */ if (UVM_ET_ISSTACK(entry) && (map->flags & VM_MAP_ISVMSPACE)) map->sserial++; /* Kill entry. */ uvm_unmap_kill_entry(map, entry); /* Update space usage. */ if ((map->flags & VM_MAP_ISVMSPACE) && entry->object.uvm_obj == NULL && entry->protection != PROT_NONE && !UVM_ET_ISHOLE(entry)) { ((struct vmspace *)map)->vm_dused -= uvmspace_dused(map, entry->start, entry->end); } if (!UVM_ET_ISHOLE(entry)) map->size -= entry->end - entry->start; /* Actual removal of entry. */ uvm_mapent_mkfree(map, entry, &prev_hint, dead, markfree); } pmap_update(vm_map_pmap(map)); #ifdef VMMAP_DEBUG if (markfree) { for (entry = uvm_map_entrybyaddr(&map->addr, start); entry != NULL && entry->start < end; entry = RBT_NEXT(uvm_map_addr, entry)) { KDASSERT(entry->end <= start || entry->start == entry->end || UVM_ET_ISHOLE(entry)); } } else { vaddr_t a; for (a = start; a < end; a += PAGE_SIZE) KDASSERT(uvm_map_entrybyaddr(&map->addr, a) == NULL); } #endif } /* * Mark all entries from first until end (exclusive) as pageable. * * Lock must be exclusive on entry and will not be touched. */ void uvm_map_pageable_pgon(struct vm_map *map, struct vm_map_entry *first, struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr) { struct vm_map_entry *iter; for (iter = first; iter != end; iter = RBT_NEXT(uvm_map_addr, iter)) { KDASSERT(iter->start >= start_addr && iter->end <= end_addr); if (!VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter)) continue; iter->wired_count = 0; uvm_fault_unwire_locked(map, iter->start, iter->end); } } /* * Mark all entries from first until end (exclusive) as wired. * * Lockflags determines the lock state on return from this function. * Lock must be exclusive on entry. */ int uvm_map_pageable_wire(struct vm_map *map, struct vm_map_entry *first, struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr, int lockflags) { struct vm_map_entry *iter; #ifdef DIAGNOSTIC unsigned int timestamp_save; #endif int error; /* * Wire pages in two passes: * * 1: holding the write lock, we create any anonymous maps that need * to be created. then we clip each map entry to the region to * be wired and increment its wiring count. * * 2: we downgrade to a read lock, and call uvm_fault_wire to fault * in the pages for any newly wired area (wired_count == 1).
* * downgrading to a read lock for uvm_fault_wire avoids a possible * deadlock with another thread that may have faulted on one of * the pages to be wired (it would mark the page busy, blocking * us, then in turn block on the map lock that we hold). * because we keep the read lock on the map, the copy-on-write * status of the entries we modify here cannot change. */ for (iter = first; iter != end; iter = RBT_NEXT(uvm_map_addr, iter)) { KDASSERT(iter->start >= start_addr && iter->end <= end_addr); if (UVM_ET_ISHOLE(iter) || iter->start == iter->end || iter->protection == PROT_NONE) continue; /* * Perform actions of vm_map_lookup that need the write lock. * - create an anonymous map for copy-on-write * - anonymous map for zero-fill * Skip submaps. */ if (!VM_MAPENT_ISWIRED(iter) && !UVM_ET_ISSUBMAP(iter) && UVM_ET_ISNEEDSCOPY(iter) && ((iter->protection & PROT_WRITE) || iter->object.uvm_obj == NULL)) { amap_copy(map, iter, M_WAITOK, UVM_ET_ISSTACK(iter) ? FALSE : TRUE, iter->start, iter->end); } iter->wired_count++; } /* * Pass 2. */ #ifdef DIAGNOSTIC timestamp_save = map->timestamp; #endif vm_map_busy(map); vm_map_downgrade(map); error = 0; for (iter = first; error == 0 && iter != end; iter = RBT_NEXT(uvm_map_addr, iter)) { if (UVM_ET_ISHOLE(iter) || iter->start == iter->end || iter->protection == PROT_NONE) continue; error = uvm_fault_wire(map, iter->start, iter->end, iter->protection); } if (error) { /* * uvm_fault_wire failure * * Reacquire lock and undo our work. */ vm_map_upgrade(map); vm_map_unbusy(map); #ifdef DIAGNOSTIC if (timestamp_save != map->timestamp) panic("uvm_map_pageable_wire: stale map"); #endif /* * first is no longer needed to restart loops. * Use it as iterator to unmap successful mappings. */ for (; first != iter; first = RBT_NEXT(uvm_map_addr, first)) { if (UVM_ET_ISHOLE(first) || first->start == first->end || first->protection == PROT_NONE) continue; first->wired_count--; if (!VM_MAPENT_ISWIRED(first)) { /* undo the wiring of this entry's range */ uvm_fault_unwire_locked(map, first->start, first->end); } } /* decrease counter in the rest of the entries */ for (; iter != end; iter = RBT_NEXT(uvm_map_addr, iter)) { if (UVM_ET_ISHOLE(iter) || iter->start == iter->end || iter->protection == PROT_NONE) continue; iter->wired_count--; } if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map); return error; } /* We are currently holding a read lock. */ if ((lockflags & UVM_LK_EXIT) == 0) { vm_map_unbusy(map); vm_map_unlock_read(map); } else { vm_map_upgrade(map); vm_map_unbusy(map); #ifdef DIAGNOSTIC if (timestamp_save != map->timestamp) panic("uvm_map_pageable_wire: stale map"); #endif } return 0; } /* * uvm_map_pageable: set pageability of a range in a map. * * Flags: * UVM_LK_ENTER: map is already locked by caller * UVM_LK_EXIT: don't unlock map on exit * * The full range must be in use (entries may not have fspace != 0). * UVM_ET_HOLE counts as unmapped. */ int uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end, boolean_t new_pageable, int lockflags) { struct vm_map_entry *first, *last, *tmp; int error; start = trunc_page(start); end = round_page(end); if (start > end) return EINVAL; if (start == end) return 0; /* nothing to do */ if (start < map->min_offset) return EFAULT; /* why? see first XXX below */ if (end > map->max_offset) return EINVAL; /* why? see second XXX below */ KASSERT(map->flags & VM_MAP_PAGEABLE); if ((lockflags & UVM_LK_ENTER) == 0) vm_map_lock(map); /* * Find first entry. * * Initial test on start is different, because of the different * error returned. Rest is tested further down.
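 * (An unmapped start yields EFAULT, while a hole found further into the
 * range yields EINVAL -- see the XXX remarks below.)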
*/ first = uvm_map_entrybyaddr(&map->addr, start); if (first->end <= start || UVM_ET_ISHOLE(first)) { /* * XXX if the first address is not mapped, it is EFAULT? */ error = EFAULT; goto out; } /* Check that the range has no holes. */ for (last = first; last != NULL && last->start < end; last = RBT_NEXT(uvm_map_addr, last)) { if (UVM_ET_ISHOLE(last) || (last->end < end && VMMAP_FREE_END(last) != last->end)) { /* * XXX unmapped memory in range, why is it EINVAL * instead of EFAULT? */ error = EINVAL; goto out; } } /* * Last ended at the first entry after the range. * Move back one step. * * Note that last may be NULL. */ if (last == NULL) { last = RBT_MAX(uvm_map_addr, &map->addr); if (last->end < end) { error = EINVAL; goto out; } } else { KASSERT(last != first); last = RBT_PREV(uvm_map_addr, last); } /* Wire/unwire pages here. */ if (new_pageable) { /* * Mark pageable. * Entries that are not wired are untouched. */ if (VM_MAPENT_ISWIRED(first)) UVM_MAP_CLIP_START(map, first, start); /* * Split last at end. * Make tmp be the first entry after what is to be touched. * If last is not wired, don't touch it. */ if (VM_MAPENT_ISWIRED(last)) { UVM_MAP_CLIP_END(map, last, end); tmp = RBT_NEXT(uvm_map_addr, last); } else tmp = last; uvm_map_pageable_pgon(map, first, tmp, start, end); error = 0; out: if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map); return error; } else { /* * Mark entries wired. * Entries are always touched (because recovery needs this). */ if (!VM_MAPENT_ISWIRED(first)) UVM_MAP_CLIP_START(map, first, start); /* * Split last at end. * Make tmp be the first entry after what is to be touched. * If last is not wired, don't touch it. */ if (!VM_MAPENT_ISWIRED(last)) { UVM_MAP_CLIP_END(map, last, end); tmp = RBT_NEXT(uvm_map_addr, last); } else tmp = last; return uvm_map_pageable_wire(map, first, tmp, start, end, lockflags); } } /* * uvm_map_pageable_all: special case of uvm_map_pageable - affects * all mapped regions. * * Map must not be locked. * If no flags are specified, all regions are unwired. */ int uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit) { vsize_t size; struct vm_map_entry *iter; KASSERT(map->flags & VM_MAP_PAGEABLE); vm_map_lock(map); if (flags == 0) { uvm_map_pageable_pgon(map, RBT_MIN(uvm_map_addr, &map->addr), NULL, map->min_offset, map->max_offset); vm_map_modflags(map, 0, VM_MAP_WIREFUTURE); vm_map_unlock(map); return 0; } if (flags & MCL_FUTURE) vm_map_modflags(map, VM_MAP_WIREFUTURE, 0); if (!(flags & MCL_CURRENT)) { vm_map_unlock(map); return 0; } /* * Count number of pages in all non-wired entries. * If the number exceeds the limit, abort. */ size = 0; RBT_FOREACH(iter, uvm_map_addr, &map->addr) { if (VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter)) continue; size += iter->end - iter->start; } if (atop(size) + uvmexp.wired > uvmexp.wiredmax) { vm_map_unlock(map); return ENOMEM; } /* XXX non-pmap_wired_count case must be handled by caller */ #ifdef pmap_wired_count if (limit != 0 && size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit) { vm_map_unlock(map); return ENOMEM; } #endif /* * uvm_map_pageable_wire will release lock */ return uvm_map_pageable_wire(map, RBT_MIN(uvm_map_addr, &map->addr), NULL, map->min_offset, map->max_offset, 0); } /* * Initialize map. * * Allocates sufficient entries to describe the free memory in the map.
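 *
 * A typical caller (cf. uvmspace_init() further down):
 *
 *	uvm_map_setup(&vm->vm_map, pmap, min, max,
 *	    (pageable ? VM_MAP_PAGEABLE : 0) | VM_MAP_ISVMSPACE);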
*/ void uvm_map_setup(struct vm_map *map, pmap_t pmap, vaddr_t min, vaddr_t max, int flags) { int i; KASSERT((min & (vaddr_t)PAGE_MASK) == 0); KASSERT((max & (vaddr_t)PAGE_MASK) == 0 || (max & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK); /* * Update parameters. * * This code handles (vaddr_t)-1 and other page mask ending addresses * properly. * We lose the top page if the full virtual address space is used. */ if (max & (vaddr_t)PAGE_MASK) { max += 1; if (max == 0) /* overflow */ max -= PAGE_SIZE; } RBT_INIT(uvm_map_addr, &map->addr); map->uaddr_exe = NULL; for (i = 0; i < nitems(map->uaddr_any); ++i) map->uaddr_any[i] = NULL; map->uaddr_brk_stack = NULL; map->pmap = pmap; map->size = 0; map->ref_count = 0; map->min_offset = min; map->max_offset = max; map->b_start = map->b_end = 0; /* Empty brk() area by default. */ map->s_start = map->s_end = 0; /* Empty stack area by default. */ map->flags = flags; map->timestamp = 0; if (flags & VM_MAP_ISVMSPACE) rw_init_flags(&map->lock, "vmmaplk", RWL_DUPOK); else rw_init(&map->lock, "kmmaplk"); mtx_init(&map->mtx, IPL_VM); mtx_init(&map->flags_lock, IPL_VM); /* Configure the allocators. */ if (flags & VM_MAP_ISVMSPACE) uvm_map_setup_md(map); else map->uaddr_any[3] = &uaddr_kbootstrap; /* * Fill map entries. * We do not need to write-lock the map here because only the current * thread sees it right now. Initialize ref_count to 0 above to avoid * bogus triggering of lock-not-held assertions. */ uvm_map_setup_entries(map); uvm_tree_sanity(map, __FILE__, __LINE__); map->ref_count = 1; } /* * Destroy the map. * * This is the inverse operation to uvm_map_setup. */ void uvm_map_teardown(struct vm_map *map) { struct uvm_map_deadq dead_entries; struct vm_map_entry *entry, *tmp; #ifdef VMMAP_DEBUG size_t numq, numt; #endif int i; KERNEL_ASSERT_LOCKED(); KERNEL_UNLOCK(); KERNEL_ASSERT_UNLOCKED(); KASSERT((map->flags & VM_MAP_INTRSAFE) == 0); /* Remove address selectors. */ uvm_addr_destroy(map->uaddr_exe); map->uaddr_exe = NULL; for (i = 0; i < nitems(map->uaddr_any); i++) { uvm_addr_destroy(map->uaddr_any[i]); map->uaddr_any[i] = NULL; } uvm_addr_destroy(map->uaddr_brk_stack); map->uaddr_brk_stack = NULL; /* * Remove entries. * * The following is based on graph breadth-first search. * * In color terms: * - the dead_entries set contains all nodes that are reachable * (i.e. both the black and the grey nodes) * - any entry not in dead_entries is white * - any entry that appears in dead_entries before entry, * is black, the rest is grey. * The set [entry, end] is also referred to as the wavefront. * * Since the tree is always a fully connected graph, the breadth-first * search guarantees that each vmmap_entry is visited exactly once. * The vm_map is broken down in linear time. */ TAILQ_INIT(&dead_entries); if ((entry = RBT_ROOT(uvm_map_addr, &map->addr)) != NULL) DEAD_ENTRY_PUSH(&dead_entries, entry); while (entry != NULL) { sched_pause(yield); uvm_unmap_kill_entry(map, entry); if ((tmp = RBT_LEFT(uvm_map_addr, entry)) != NULL) DEAD_ENTRY_PUSH(&dead_entries, tmp); if ((tmp = RBT_RIGHT(uvm_map_addr, entry)) != NULL) DEAD_ENTRY_PUSH(&dead_entries, tmp); /* Update wave-front. */ entry = TAILQ_NEXT(entry, dfree.deadq); } #ifdef VMMAP_DEBUG numt = numq = 0; RBT_FOREACH(entry, uvm_map_addr, &map->addr) numt++; TAILQ_FOREACH(entry, &dead_entries, dfree.deadq) numq++; KASSERT(numt == numq); #endif uvm_unmap_detach(&dead_entries, UVM_PLA_WAITOK); KERNEL_LOCK(); pmap_destroy(map->pmap); map->pmap = NULL; } /* * Populate map with free-memory entries. 
* * Map must be initialized and empty. */ void uvm_map_setup_entries(struct vm_map *map) { KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr)); uvm_map_fix_space(map, NULL, map->min_offset, map->max_offset, 0); } /* * Split entry at given address. * * orig: entry that is to be split. * next: a newly allocated map entry that is not linked. * split: address at which the split is done. */ void uvm_map_splitentry(struct vm_map *map, struct vm_map_entry *orig, struct vm_map_entry *next, vaddr_t split) { struct uvm_addr_state *free, *free_before; vsize_t adj; if ((split & PAGE_MASK) != 0) { panic("uvm_map_splitentry: split address 0x%lx " "not on page boundary!", split); } KDASSERT(map != NULL && orig != NULL && next != NULL); uvm_tree_sanity(map, __FILE__, __LINE__); KASSERT(orig->start < split && VMMAP_FREE_END(orig) > split); #ifdef VMMAP_DEBUG KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, orig) == orig); KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, next) != next); #endif /* VMMAP_DEBUG */ /* * Free space will change, unlink from free space tree. */ free = uvm_map_uaddr_e(map, orig); uvm_mapent_free_remove(map, free, orig); adj = split - orig->start; uvm_mapent_copy(orig, next); if (split >= orig->end) { next->etype = 0; next->offset = 0; next->wired_count = 0; next->start = next->end = split; next->guard = 0; next->fspace = VMMAP_FREE_END(orig) - split; next->aref.ar_amap = NULL; next->aref.ar_pageoff = 0; orig->guard = MIN(orig->guard, split - orig->end); orig->fspace = split - VMMAP_FREE_START(orig); } else { orig->fspace = 0; orig->guard = 0; orig->end = next->start = split; if (next->aref.ar_amap) { amap_splitref(&orig->aref, &next->aref, adj); } if (UVM_ET_ISSUBMAP(orig)) { uvm_map_reference(next->object.sub_map); next->offset += adj; } else if (UVM_ET_ISOBJ(orig)) { if (next->object.uvm_obj->pgops && next->object.uvm_obj->pgops->pgo_reference) { KERNEL_LOCK(); next->object.uvm_obj->pgops->pgo_reference( next->object.uvm_obj); KERNEL_UNLOCK(); } next->offset += adj; } } /* * Link next into address tree. * Link orig and next into free-space tree. * * Don't insert 'next' into the addr tree until orig has been linked, * in case the free-list looks at adjacent entries in the addr tree * for its decisions. */ if (orig->fspace > 0) free_before = free; else free_before = uvm_map_uaddr_e(map, orig); uvm_mapent_free_insert(map, free_before, orig); uvm_mapent_addr_insert(map, next); uvm_mapent_free_insert(map, free, next); uvm_tree_sanity(map, __FILE__, __LINE__); } #ifdef VMMAP_DEBUG void uvm_tree_assert(struct vm_map *map, int test, char *test_str, char *file, int line) { char* map_special; if (test) return; if (map == kernel_map) map_special = " (kernel_map)"; else if (map == kmem_map) map_special = " (kmem_map)"; else map_special = ""; panic("uvm_tree_sanity %p%s (%s %d): %s", map, map_special, file, line, test_str); } /* * Check that map is sane. */ void uvm_tree_sanity(struct vm_map *map, char *file, int line) { struct vm_map_entry *iter; vaddr_t addr; vaddr_t min, max, bound; /* Bounds checker. */ struct uvm_addr_state *free; addr = vm_map_min(map); RBT_FOREACH(iter, uvm_map_addr, &map->addr) { /* * Valid start, end. * Catch overflow for end+fspace. */ UVM_ASSERT(map, iter->end >= iter->start, file, line); UVM_ASSERT(map, VMMAP_FREE_END(iter) >= iter->end, file, line); /* May not be empty. */ UVM_ASSERT(map, iter->start < VMMAP_FREE_END(iter), file, line); /* Addresses for entry must lie within map boundaries.
*/ UVM_ASSERT(map, iter->start >= vm_map_min(map) && VMMAP_FREE_END(iter) <= vm_map_max(map), file, line); /* Tree may not have gaps. */ UVM_ASSERT(map, iter->start == addr, file, line); addr = VMMAP_FREE_END(iter); /* * Free space may not cross boundaries, unless the same * free list is used on both sides of the border. */ min = VMMAP_FREE_START(iter); max = VMMAP_FREE_END(iter); while (min < max && (bound = uvm_map_boundary(map, min, max)) != max) { UVM_ASSERT(map, uvm_map_uaddr(map, bound - 1) == uvm_map_uaddr(map, bound), file, line); min = bound; } free = uvm_map_uaddr_e(map, iter); if (free) { UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) != 0, file, line); } else { UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) == 0, file, line); } } UVM_ASSERT(map, addr == vm_map_max(map), file, line); } void uvm_tree_size_chk(struct vm_map *map, char *file, int line) { struct vm_map_entry *iter; vsize_t size; size = 0; RBT_FOREACH(iter, uvm_map_addr, &map->addr) { if (!UVM_ET_ISHOLE(iter)) size += iter->end - iter->start; } if (map->size != size) printf("map size = 0x%lx, should be 0x%lx\n", map->size, size); UVM_ASSERT(map, map->size == size, file, line); vmspace_validate(map); } /* * This function validates the statistics on vmspace. */ void vmspace_validate(struct vm_map *map) { struct vmspace *vm; struct vm_map_entry *iter; vaddr_t imin, imax; vaddr_t stack_begin, stack_end; /* Position of stack. */ vsize_t stack, heap; /* Measured sizes. */ if (!(map->flags & VM_MAP_ISVMSPACE)) return; vm = (struct vmspace *)map; stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); stack = heap = 0; RBT_FOREACH(iter, uvm_map_addr, &map->addr) { imin = imax = iter->start; if (UVM_ET_ISHOLE(iter) || iter->object.uvm_obj != NULL || iter->protection == PROT_NONE) continue; /* * Update stack, heap. * Keep in mind that (theoretically) the entries of * userspace and stack may be joined. */ while (imin != iter->end) { /* * Set imax to the first boundary crossed between * imin and stack addresses. */ imax = iter->end; if (imin < stack_begin && imax > stack_begin) imax = stack_begin; else if (imin < stack_end && imax > stack_end) imax = stack_end; if (imin >= stack_begin && imin < stack_end) stack += imax - imin; else heap += imax - imin; imin = imax; } } heap >>= PAGE_SHIFT; if (heap != vm->vm_dused) { printf("vmspace stack range: 0x%lx-0x%lx\n", stack_begin, stack_end); panic("vmspace_validate: vmspace.vm_dused invalid, " "expected %ld pgs, got %ld pgs in map %p", heap, vm->vm_dused, map); } } #endif /* VMMAP_DEBUG */ /* * uvm_map_init: init mapping system at boot time. note that we allocate * and init the static pool of structs vm_map_entry for the kernel here. */ void uvm_map_init(void) { static struct vm_map_entry kernel_map_entry[MAX_KMAPENT]; int lcv; /* now set up static pool of kernel map entries ... */ mtx_init(&uvm_kmapent_mtx, IPL_VM); SLIST_INIT(&uvm.kentry_free); for (lcv = 0 ; lcv < MAX_KMAPENT ; lcv++) { SLIST_INSERT_HEAD(&uvm.kentry_free, &kernel_map_entry[lcv], daddrs.addr_kentry); } /* initialize the map-related pools.
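 * Three pools are set up below: uvm_vmspace_pool for struct vmspace,
 * uvm_map_entry_pool for regular map entries, and uvm_map_entry_kmem_pool
 * for entries used by the kernel map; the two entry pools run at IPL_VM.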
*/ pool_init(&uvm_vmspace_pool, sizeof(struct vmspace), 0, IPL_NONE, PR_WAITOK, "vmsppl", NULL); pool_init(&uvm_map_entry_pool, sizeof(struct vm_map_entry), 0, IPL_VM, PR_WAITOK, "vmmpepl", NULL); pool_init(&uvm_map_entry_kmem_pool, sizeof(struct vm_map_entry), 0, IPL_VM, 0, "vmmpekpl", NULL); pool_sethiwat(&uvm_map_entry_pool, 8192); uvm_addr_init(); } #if defined(DDB) /* * DDB hooks */ /* * uvm_map_printit: actually prints the map */ void uvm_map_printit(struct vm_map *map, boolean_t full, int (*pr)(const char *, ...)) { struct vmspace *vm; struct vm_map_entry *entry; struct uvm_addr_state *free; int in_free, i; char buf[8]; (*pr)("MAP %p: [0x%lx->0x%lx]\n", map, map->min_offset,map->max_offset); (*pr)("\tbrk() allocate range: 0x%lx-0x%lx\n", map->b_start, map->b_end); (*pr)("\tstack allocate range: 0x%lx-0x%lx\n", map->s_start, map->s_end); (*pr)("\tsz=%u, ref=%d, version=%u, flags=0x%x\n", map->size, map->ref_count, map->timestamp, map->flags); (*pr)("\tpmap=%p(resident=%d)\n", map->pmap, pmap_resident_count(map->pmap)); /* struct vmspace handling. */ if (map->flags & VM_MAP_ISVMSPACE) { vm = (struct vmspace *)map; (*pr)("\tvm_refcnt=%d vm_shm=%p vm_rssize=%u vm_swrss=%u\n", vm->vm_refcnt, vm->vm_shm, vm->vm_rssize, vm->vm_swrss); (*pr)("\tvm_tsize=%u vm_dsize=%u\n", vm->vm_tsize, vm->vm_dsize); (*pr)("\tvm_taddr=%p vm_daddr=%p\n", vm->vm_taddr, vm->vm_daddr); (*pr)("\tvm_maxsaddr=%p vm_minsaddr=%p\n", vm->vm_maxsaddr, vm->vm_minsaddr); } if (!full) goto print_uaddr; RBT_FOREACH(entry, uvm_map_addr, &map->addr) { (*pr)(" - %p: 0x%lx->0x%lx: obj=%p/0x%llx, amap=%p/%d\n", entry, entry->start, entry->end, entry->object.uvm_obj, (long long)entry->offset, entry->aref.ar_amap, entry->aref.ar_pageoff); (*pr)("\tsubmap=%c, cow=%c, nc=%c, stack=%c, " "syscall=%c, prot(max)=%d/%d, inh=%d, " "wc=%d, adv=%d\n", (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F', (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F', (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F', (entry->etype & UVM_ET_STACK) ? 'T' : 'F', (entry->etype & UVM_ET_SYSCALL) ? 'T' : 'F', entry->protection, entry->max_protection, entry->inheritance, entry->wired_count, entry->advice); free = uvm_map_uaddr_e(map, entry); in_free = (free != NULL); (*pr)("\thole=%c, free=%c, guard=0x%lx, " "free=0x%lx-0x%lx\n", (entry->etype & UVM_ET_HOLE) ? 'T' : 'F', in_free ? 'T' : 'F', entry->guard, VMMAP_FREE_START(entry), VMMAP_FREE_END(entry)); (*pr)("\tfspace_augment=%lu\n", entry->fspace_augment); (*pr)("\tfreemapped=%c, uaddr=%p\n", (entry->etype & UVM_ET_FREEMAPPED) ? 
'T' : 'F', free); if (free) { (*pr)("\t\t(0x%lx-0x%lx %s)\n", free->uaddr_minaddr, free->uaddr_maxaddr, free->uaddr_functions->uaddr_name); } } print_uaddr: uvm_addr_print(map->uaddr_exe, "exe", full, pr); for (i = 0; i < nitems(map->uaddr_any); i++) { snprintf(&buf[0], sizeof(buf), "any[%d]", i); uvm_addr_print(map->uaddr_any[i], &buf[0], full, pr); } uvm_addr_print(map->uaddr_brk_stack, "brk/stack", full, pr); } /* * uvm_object_printit: actually prints the object */ void uvm_object_printit(struct uvm_object *uobj, boolean_t full, int (*pr)(const char *, ...)) { struct vm_page *pg; int cnt = 0; (*pr)("OBJECT %p: pgops=%p, npages=%d, ", uobj, uobj->pgops, uobj->uo_npages); if (UVM_OBJ_IS_KERN_OBJECT(uobj)) (*pr)("refs=<SYSTEM>\n"); else (*pr)("refs=%d\n", uobj->uo_refs); if (!full) { return; } (*pr)(" PAGES <pg,offset>:\n "); RBT_FOREACH(pg, uvm_objtree, &uobj->memt) { (*pr)("<%p,0x%llx> ", pg, (long long)pg->offset); if ((cnt % 3) == 2) { (*pr)("\n "); } cnt++; } if ((cnt % 3) != 2) { (*pr)("\n"); } } /* * uvm_page_printit: actually print the page */ static const char page_flagbits[] = "\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY" "\11ZERO\12DEV\15PAGER1\21FREE\22INACTIVE\23ACTIVE\25ANON\26AOBJ" "\27ENCRYPT\31PMAP0\32PMAP1\33PMAP2\34PMAP3\35PMAP4\36PMAP5"; void uvm_page_printit(struct vm_page *pg, boolean_t full, int (*pr)(const char *, ...)) { struct vm_page *tpg; struct uvm_object *uobj; struct pglist *pgl; (*pr)("PAGE %p:\n", pg); (*pr)(" flags=%b, vers=%d, wire_count=%d, pa=0x%llx\n", pg->pg_flags, page_flagbits, pg->pg_version, pg->wire_count, (long long)pg->phys_addr); (*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n", pg->uobject, pg->uanon, (long long)pg->offset); #if defined(UVM_PAGE_TRKOWN) if (pg->pg_flags & PG_BUSY) (*pr)(" owning thread = %d, tag=%s", pg->owner, pg->owner_tag); else (*pr)(" page not busy, no owner"); #else (*pr)(" [page ownership tracking disabled]"); #endif (*pr)("\tvm_page_md %p\n", &pg->mdpage); if (!full) return; /* cross-verify object/anon */ if ((pg->pg_flags & PQ_FREE) == 0) { if (pg->pg_flags & PQ_ANON) { if (pg->uanon == NULL || pg->uanon->an_page != pg) (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n", (pg->uanon) ? pg->uanon->an_page : NULL); else (*pr)(" anon backpointer is OK\n"); } else { uobj = pg->uobject; if (uobj) { (*pr)(" checking object list\n"); RBT_FOREACH(tpg, uvm_objtree, &uobj->memt) { if (tpg == pg) { break; } } if (tpg) (*pr)(" page found on object list\n"); else (*pr)(" >>> PAGE NOT FOUND " "ON OBJECT LIST! <<<\n"); } } } /* cross-verify page queue */ if (pg->pg_flags & PQ_FREE) { if (uvm_pmr_isfree(pg)) (*pr)(" page found in uvm_pmemrange\n"); else (*pr)(" >>> page not found in uvm_pmemrange <<<\n"); pgl = NULL; } else if (pg->pg_flags & PQ_INACTIVE) { pgl = (pg->pg_flags & PQ_SWAPBACKED) ? &uvm.page_inactive_swp : &uvm.page_inactive_obj; } else if (pg->pg_flags & PQ_ACTIVE) { pgl = &uvm.page_active; } else { pgl = NULL; } if (pgl) { (*pr)(" checking pageq list\n"); TAILQ_FOREACH(tpg, pgl, pageq) { if (tpg == pg) { break; } } if (tpg) (*pr)(" page found on pageq list\n"); else (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n"); } } #endif /* * uvm_map_protect: change map protection * * => set_max means set max_protection. * => map must be unlocked. 
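 *
 * A minimal usage sketch (illustrative; mprotect(2) is the typical
 * caller, with start and end page-aligned):
 *
 *	error = uvm_map_protect(map, start, end, PROT_READ, FALSE);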
*/ int uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end, vm_prot_t new_prot, boolean_t set_max) { struct vm_map_entry *first, *iter; vm_prot_t old_prot; vm_prot_t mask; vsize_t dused; int error; if (start > end) return EINVAL; start = MAX(start, map->min_offset); end = MIN(end, map->max_offset); if (start >= end) return 0; dused = 0; error = 0; vm_map_lock(map); /* * Set up first and last. * - first will contain first entry at or after start. */ first = uvm_map_entrybyaddr(&map->addr, start); KDASSERT(first != NULL); if (first->end <= start) first = RBT_NEXT(uvm_map_addr, first); /* First, check for protection violations. */ for (iter = first; iter != NULL && iter->start < end; iter = RBT_NEXT(uvm_map_addr, iter)) { /* Treat memory holes as free space. */ if (iter->start == iter->end || UVM_ET_ISHOLE(iter)) continue; old_prot = iter->protection; if (old_prot == PROT_NONE && new_prot != old_prot) { dused += uvmspace_dused( map, MAX(start, iter->start), MIN(end, iter->end)); } if (UVM_ET_ISSUBMAP(iter)) { error = EINVAL; goto out; } if ((new_prot & iter->max_protection) != new_prot) { error = EACCES; goto out; } if (map == kernel_map && (new_prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC)) panic("uvm_map_protect: kernel map W^X violation requested"); } /* Check limits. */ if (dused > 0 && (map->flags & VM_MAP_ISVMSPACE)) { vsize_t limit = lim_cur(RLIMIT_DATA); dused = ptoa(dused); if (limit < dused || limit - dused < ptoa(((struct vmspace *)map)->vm_dused)) { error = ENOMEM; goto out; } } /* Fix protections. */ for (iter = first; iter != NULL && iter->start < end; iter = RBT_NEXT(uvm_map_addr, iter)) { /* Treat memory holes as free space. */ if (iter->start == iter->end || UVM_ET_ISHOLE(iter)) continue; old_prot = iter->protection; /* * Skip adapting protection iff old and new protection * are equal. */ if (set_max) { if (old_prot == (new_prot & old_prot) && iter->max_protection == new_prot) continue; } else { if (old_prot == new_prot) continue; } UVM_MAP_CLIP_START(map, iter, start); UVM_MAP_CLIP_END(map, iter, end); if (set_max) { iter->max_protection = new_prot; iter->protection &= new_prot; } else iter->protection = new_prot; /* * update physical map if necessary. worry about copy-on-write * here -- CHECK THIS XXX */ if (iter->protection != old_prot) { mask = UVM_ET_ISCOPYONWRITE(iter) ? ~PROT_WRITE : PROT_MASK; /* XXX should only wserial++ if no split occurs */ if (iter->protection & PROT_WRITE) map->wserial++; if (map->flags & VM_MAP_ISVMSPACE) { if (old_prot == PROT_NONE) { ((struct vmspace *)map)->vm_dused += uvmspace_dused(map, iter->start, iter->end); } if (iter->protection == PROT_NONE) { ((struct vmspace *)map)->vm_dused -= uvmspace_dused(map, iter->start, iter->end); } } /* update pmap */ if ((iter->protection & mask) == PROT_NONE && VM_MAPENT_ISWIRED(iter)) { /* * TODO(ariane) this is stupid. wired_count * is 0 if not wired, otherwise anything * larger than 0 (incremented once each time * wire is called). * Mostly to be able to undo the damage on * failure. Not to actually be a wired * refcounter... * Originally: iter->wired_count--; * (don't we have to unwire this in the pmap * as well?) */ iter->wired_count = 0; } pmap_protect(map->pmap, iter->start, iter->end, iter->protection & mask); } /* * If the map is configured to lock any future mappings, * wire this entry now if the old protection was PROT_NONE * and the new protection is not PROT_NONE.
*/ if ((map->flags & VM_MAP_WIREFUTURE) != 0 && VM_MAPENT_ISWIRED(iter) == 0 && old_prot == PROT_NONE && new_prot != PROT_NONE) { if (uvm_map_pageable(map, iter->start, iter->end, FALSE, UVM_LK_ENTER | UVM_LK_EXIT) != 0) { /* * If locking the entry fails, remember the * error if it's the first one. Note we * still continue setting the protection in * the map, but it will return the resource * shortage condition regardless. * * XXX Ignore what the actual error is, * XXX just call it a resource shortage * XXX so that it doesn't get confused with * XXX what uvm_map_protect() itself would * XXX normally return. */ error = ENOMEM; } } } pmap_update(map->pmap); out: vm_map_unlock(map); return error; } /* * uvmspace_alloc: allocate a vmspace structure. * * - structure includes vm_map and pmap * - XXX: no locking on this structure * - refcnt set to 1, rest must be init'd by caller */ struct vmspace * uvmspace_alloc(vaddr_t min, vaddr_t max, boolean_t pageable, boolean_t remove_holes) { struct vmspace *vm; vm = pool_get(&uvm_vmspace_pool, PR_WAITOK | PR_ZERO); uvmspace_init(vm, NULL, min, max, pageable, remove_holes); return (vm); } /* * uvmspace_init: initialize a vmspace structure. * * - XXX: no locking on this structure * - refcnt set to 1, rest must be init'd by caller */ void uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t min, vaddr_t max, boolean_t pageable, boolean_t remove_holes) { KASSERT(pmap == NULL || pmap == pmap_kernel()); if (pmap) pmap_reference(pmap); else pmap = pmap_create(); uvm_map_setup(&vm->vm_map, pmap, min, max, (pageable ? VM_MAP_PAGEABLE : 0) | VM_MAP_ISVMSPACE); vm->vm_refcnt = 1; if (remove_holes) pmap_remove_holes(vm); } /* * uvmspace_share: share a vmspace between two processes * * - used for vfork */ struct vmspace * uvmspace_share(struct process *pr) { struct vmspace *vm = pr->ps_vmspace; uvmspace_addref(vm); return vm; } /* * uvmspace_exec: the process wants to exec a new program * * - XXX: no locking on vmspace */ void uvmspace_exec(struct proc *p, vaddr_t start, vaddr_t end) { struct process *pr = p->p_p; struct vmspace *nvm, *ovm = pr->ps_vmspace; struct vm_map *map = &ovm->vm_map; struct uvm_map_deadq dead_entries; KASSERT((start & (vaddr_t)PAGE_MASK) == 0); KASSERT((end & (vaddr_t)PAGE_MASK) == 0 || (end & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK); pmap_unuse_final(p); /* before stack addresses go away */ TAILQ_INIT(&dead_entries); /* see if more than one process is using this vmspace... */ if (ovm->vm_refcnt == 1) { /* * If pr is the only process using its vmspace then * we can safely recycle that vmspace for the program * that is being exec'd. */ #ifdef SYSVSHM /* * SYSV SHM semantics require us to kill all segments on an exec */ if (ovm->vm_shm) shmexit(ovm); #endif /* * POSIX 1003.1b -- "lock future mappings" is revoked * when a process execs another program image. */ vm_map_lock(map); vm_map_modflags(map, 0, VM_MAP_WIREFUTURE|VM_MAP_SYSCALL_ONCE); /* * now unmap the old program * * Instead of attempting to keep the map valid, we simply * nuke all entries and ask uvm_map_setup to reinitialize * the map to the new boundaries. * * uvm_unmap_remove will actually nuke all entries for us * (as in, not replace them with free-memory entries). */ uvm_unmap_remove(map, map->min_offset, map->max_offset, &dead_entries, TRUE, FALSE); KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr)); /* Nuke statistics and boundaries.
*/ memset(&ovm->vm_startcopy, 0, (caddr_t) (ovm + 1) - (caddr_t) &ovm->vm_startcopy); if (end & (vaddr_t)PAGE_MASK) { end += 1; if (end == 0) /* overflow */ end -= PAGE_SIZE; } /* Setup new boundaries and populate map with entries. */ map->min_offset = start; map->max_offset = end; uvm_map_setup_entries(map); vm_map_unlock(map); /* but keep MMU holes unavailable */ pmap_remove_holes(ovm); } else { /* * pr's vmspace is being shared, so we can't reuse * it for pr since it is still being used for others. * allocate a new vmspace for pr */ nvm = uvmspace_alloc(start, end, (map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, TRUE); /* install new vmspace and drop our ref to the old one. */ pmap_deactivate(p); p->p_vmspace = pr->ps_vmspace = nvm; pmap_activate(p); uvmspace_free(ovm); } /* Release dead entries */ uvm_unmap_detach(&dead_entries, 0); } /* * uvmspace_addref: add a reference to a vmspace. */ void uvmspace_addref(struct vmspace *vm) { KERNEL_ASSERT_LOCKED(); KASSERT(vm->vm_refcnt > 0); vm->vm_refcnt++; } /* * uvmspace_free: free a vmspace data structure */ void uvmspace_free(struct vmspace *vm) { KERNEL_ASSERT_LOCKED(); if (--vm->vm_refcnt == 0) { /* * lock the map, to wait out all other references to it. delete * all of the mappings and pages they hold, then call the pmap * module to reclaim anything left. */ #ifdef SYSVSHM /* Get rid of any SYSV shared memory segments. */ if (vm->vm_shm != NULL) shmexit(vm); #endif uvm_map_teardown(&vm->vm_map); pool_put(&uvm_vmspace_pool, vm); } } /* * uvm_share: Map the address range [srcaddr, srcaddr + sz) in * srcmap to the address range [dstaddr, dstaddr + sz) in * dstmap. * * The whole address range in srcmap must be backed by an object * (no holes). * * If successful, the address ranges share memory and the destination * address range uses the protection flags in prot. * * This routine assumes that sz is a multiple of PAGE_SIZE and * that dstaddr and srcaddr are page-aligned. */ int uvm_share(struct vm_map *dstmap, vaddr_t dstaddr, vm_prot_t prot, struct vm_map *srcmap, vaddr_t srcaddr, vsize_t sz) { int ret = 0; vaddr_t unmap_end; vaddr_t dstva; vsize_t s_off, len, n = sz, remain; struct vm_map_entry *first = NULL, *last = NULL; struct vm_map_entry *src_entry, *psrc_entry = NULL; struct uvm_map_deadq dead; if (srcaddr >= srcmap->max_offset || sz > srcmap->max_offset - srcaddr) return EINVAL; TAILQ_INIT(&dead); vm_map_lock(dstmap); vm_map_lock_read(srcmap); if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, sz)) { ret = ENOMEM; goto exit_unlock; } if (!uvm_map_lookup_entry(srcmap, srcaddr, &src_entry)) { ret = EINVAL; goto exit_unlock; } dstva = dstaddr; unmap_end = dstaddr; for (; src_entry != NULL; psrc_entry = src_entry, src_entry = RBT_NEXT(uvm_map_addr, src_entry)) { /* hole in address space, bail out */ if (psrc_entry != NULL && psrc_entry->end != src_entry->start) break; if (src_entry->start >= srcaddr + sz) break; if (UVM_ET_ISSUBMAP(src_entry)) panic("uvm_share: encountered a submap (illegal)"); if (!UVM_ET_ISCOPYONWRITE(src_entry) && UVM_ET_ISNEEDSCOPY(src_entry)) panic("uvm_share: non-copy_on_write map entries " "marked needs_copy (illegal)"); /* * srcaddr > map entry start? means we are in the middle of a * map, so we calculate the offset to use in the source map. 
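 *
 * For example, if src_entry covers [0x1000, 0x5000) and srcaddr is
 * 0x3000, then s_off = 0x2000 and at most remain = 0x2000 bytes can be
 * shared from this entry in this pass.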
*/ if (srcaddr > src_entry->start) s_off = srcaddr - src_entry->start; else if (srcaddr == src_entry->start) s_off = 0; else panic("uvm_share: map entry start > srcaddr"); remain = src_entry->end - src_entry->start - s_off; /* Determine how many bytes to share in this pass */ if (n < remain) len = n; else len = remain; if (uvm_mapent_share(dstmap, dstva, len, s_off, prot, prot, srcmap, src_entry, &dead) == NULL) break; n -= len; dstva += len; srcaddr += len; unmap_end = dstva + len; if (n == 0) goto exit_unlock; } ret = EINVAL; uvm_unmap_remove(dstmap, dstaddr, unmap_end, &dead, FALSE, TRUE); exit_unlock: vm_map_unlock_read(srcmap); vm_map_unlock(dstmap); uvm_unmap_detach(&dead, 0); return ret; } /* * Clone map entry into other map. * * Mapping will be placed at dstaddr, for the same length. * Space must be available. * Reference counters are incremented. */ struct vm_map_entry * uvm_mapent_clone(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen, vsize_t off, vm_prot_t prot, vm_prot_t maxprot, struct vm_map_entry *old_entry, struct uvm_map_deadq *dead, int mapent_flags, int amap_share_flags) { struct vm_map_entry *new_entry, *first, *last; KDASSERT(!UVM_ET_ISSUBMAP(old_entry)); /* Create new entry (linked in on creation). Fill in first, last. */ first = last = NULL; if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, dstlen)) { panic("uvm_mapent_clone: no space in map for " "entry in empty map"); } new_entry = uvm_map_mkentry(dstmap, first, last, dstaddr, dstlen, mapent_flags, dead, NULL); if (new_entry == NULL) return NULL; /* old_entry -> new_entry */ new_entry->object = old_entry->object; new_entry->offset = old_entry->offset; new_entry->aref = old_entry->aref; new_entry->etype |= old_entry->etype & ~UVM_ET_FREEMAPPED; new_entry->protection = prot; new_entry->max_protection = maxprot; new_entry->inheritance = old_entry->inheritance; new_entry->advice = old_entry->advice; /* gain reference to object backing the map (can't be a submap). */ if (new_entry->aref.ar_amap) { new_entry->aref.ar_pageoff += off >> PAGE_SHIFT; amap_ref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff, (new_entry->end - new_entry->start) >> PAGE_SHIFT, amap_share_flags); } if (UVM_ET_ISOBJ(new_entry) && new_entry->object.uvm_obj->pgops->pgo_reference) { new_entry->offset += off; new_entry->object.uvm_obj->pgops->pgo_reference (new_entry->object.uvm_obj); } return new_entry; } struct vm_map_entry * uvm_mapent_share(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen, vsize_t off, vm_prot_t prot, vm_prot_t maxprot, struct vm_map *old_map, struct vm_map_entry *old_entry, struct uvm_map_deadq *dead) { /* * If old_entry refers to a copy-on-write region that has not yet been * written to (needs_copy flag is set), then we need to allocate a new * amap for old_entry. * * If we do not do this, and the process owning old_entry does a copy-on * write later, old_entry and new_entry will refer to different memory * regions, and the memory between the processes is no longer shared. * * [in other words, we need to clear needs_copy] */ if (UVM_ET_ISNEEDSCOPY(old_entry)) { /* get our own amap, clears needs_copy */ amap_copy(old_map, old_entry, M_WAITOK, FALSE, 0, 0); /* XXXCDC: WAITOK??? */ } return uvm_mapent_clone(dstmap, dstaddr, dstlen, off, prot, maxprot, old_entry, dead, 0, AMAP_SHARED); } /* * share the mapping: this means we want the old and * new entries to share amaps and backing objects. 
*/ struct vm_map_entry * uvm_mapent_forkshared(struct vmspace *new_vm, struct vm_map *new_map, struct vm_map *old_map, struct vm_map_entry *old_entry, struct uvm_map_deadq *dead) { struct vm_map_entry *new_entry; new_entry = uvm_mapent_share(new_map, old_entry->start, old_entry->end - old_entry->start, 0, old_entry->protection, old_entry->max_protection, old_map, old_entry, dead); /* * pmap_copy the mappings: this routine is optional * but if it is there it will reduce the number of * page faults in the new proc. */ if (!UVM_ET_ISHOLE(new_entry)) pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (new_entry->end - new_entry->start), new_entry->start); return (new_entry); } /* * copy-on-write the mapping (using mmap's * MAP_PRIVATE semantics) * * allocate new_entry, adjust reference counts. * (note that new references are read-only). */ struct vm_map_entry * uvm_mapent_forkcopy(struct vmspace *new_vm, struct vm_map *new_map, struct vm_map *old_map, struct vm_map_entry *old_entry, struct uvm_map_deadq *dead) { struct vm_map_entry *new_entry; boolean_t protect_child; new_entry = uvm_mapent_clone(new_map, old_entry->start, old_entry->end - old_entry->start, 0, old_entry->protection, old_entry->max_protection, old_entry, dead, 0, 0); new_entry->etype |= (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY); /* * the new entry will need an amap. it will either * need to be copied from the old entry or created * from scratch (if the old entry does not have an * amap). can we defer this process until later * (by setting "needs_copy") or do we need to copy * the amap now? * * we must copy the amap now if any of the following * conditions hold: * 1. the old entry has an amap and that amap is * being shared. this means that the old (parent) * process is sharing the amap with another * process. if we do not clear needs_copy here * we will end up in a situation where both the * parent and child process are referring to the * same amap with "needs_copy" set. if the * parent write-faults, the fault routine will * clear "needs_copy" in the parent by allocating * a new amap. this is wrong because the * parent is supposed to be sharing the old amap * and the new amap will break that. * * 2. if the old entry has an amap and a non-zero * wire count then we are going to have to call * amap_cow_now to avoid page faults in the * parent process. since amap_cow_now requires * "needs_copy" to be clear we might as well * clear it here as well. * */ if (old_entry->aref.ar_amap != NULL && ((amap_flags(old_entry->aref.ar_amap) & AMAP_SHARED) != 0 || VM_MAPENT_ISWIRED(old_entry))) { amap_copy(new_map, new_entry, M_WAITOK, FALSE, 0, 0); /* XXXCDC: M_WAITOK ... ok? */ } /* * if the parent's entry is wired down, then the * parent process does not want page faults on * access to that memory. this means that we * cannot do copy-on-write because we can't write * protect the old entry. in this case we * resolve all copy-on-write faults now, using * amap_cow_now. note that we have already * allocated any needed amap (above). */ if (VM_MAPENT_ISWIRED(old_entry)) { /* * resolve all copy-on-write faults now * (note that there is nothing to do if * the old mapping does not have an amap). * XXX: is it worthwhile to bother with * pmap_copy in this case? */ if (old_entry->aref.ar_amap) amap_cow_now(new_map, new_entry); } else { if (old_entry->aref.ar_amap) { /* * setup mappings to trigger copy-on-write faults * we must write-protect the parent if it has * an amap and it is not already "needs_copy"... 
* if it is already "needs_copy" then the parent * has already been write-protected by a previous * fork operation. * * if we do not write-protect the parent, then * we must be sure to write-protect the child * after the pmap_copy() operation. * * XXX: pmap_copy should have some way of telling * us that it didn't do anything so we can avoid * calling pmap_protect needlessly. */ if (!UVM_ET_ISNEEDSCOPY(old_entry)) { if (old_entry->max_protection & PROT_WRITE) { pmap_protect(old_map->pmap, old_entry->start, old_entry->end, old_entry->protection & ~PROT_WRITE); pmap_update(old_map->pmap); } old_entry->etype |= UVM_ET_NEEDSCOPY; } /* parent must now be write-protected */ protect_child = FALSE; } else { /* * we only need to protect the child if the * parent has write access. */ if (old_entry->max_protection & PROT_WRITE) protect_child = TRUE; else protect_child = FALSE; } /* * copy the mappings * XXX: need a way to tell if this does anything */ if (!UVM_ET_ISHOLE(new_entry)) pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); /* protect the child's mappings if necessary */ if (protect_child) { pmap_protect(new_map->pmap, new_entry->start, new_entry->end, new_entry->protection & ~PROT_WRITE); } } return (new_entry); } /* * zero the mapping: the new entry will be zero initialized */ struct vm_map_entry * uvm_mapent_forkzero(struct vmspace *new_vm, struct vm_map *new_map, struct vm_map *old_map, struct vm_map_entry *old_entry, struct uvm_map_deadq *dead) { struct vm_map_entry *new_entry; new_entry = uvm_mapent_clone(new_map, old_entry->start, old_entry->end - old_entry->start, 0, old_entry->protection, old_entry->max_protection, old_entry, dead, 0, 0); new_entry->etype |= (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY); if (new_entry->aref.ar_amap) { amap_unref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff, atop(new_entry->end - new_entry->start), 0); new_entry->aref.ar_amap = NULL; new_entry->aref.ar_pageoff = 0; } if (UVM_ET_ISOBJ(new_entry)) { if (new_entry->object.uvm_obj->pgops->pgo_detach) new_entry->object.uvm_obj->pgops->pgo_detach( new_entry->object.uvm_obj); new_entry->object.uvm_obj = NULL; new_entry->etype &= ~UVM_ET_OBJ; } return (new_entry); } /* * uvmspace_fork: fork a process' main map * * => create a new vmspace for child process from parent. * => parent's map must not be locked. */ struct vmspace * uvmspace_fork(struct process *pr) { struct vmspace *vm1 = pr->ps_vmspace; struct vmspace *vm2; struct vm_map *old_map = &vm1->vm_map; struct vm_map *new_map; struct vm_map_entry *old_entry, *new_entry; struct uvm_map_deadq dead; vm_map_lock(old_map); vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset, (old_map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, FALSE); memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy, (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy); vm2->vm_dused = 0; /* Statistic managed by us. */ new_map = &vm2->vm_map; vm_map_lock(new_map); /* go entry-by-entry */ TAILQ_INIT(&dead); RBT_FOREACH(old_entry, uvm_map_addr, &old_map->addr) { if (old_entry->start == old_entry->end) continue; /* first, some sanity checks on the old entry */ if (UVM_ET_ISSUBMAP(old_entry)) { panic("fork: encountered a submap during fork " "(illegal)"); } if (!UVM_ET_ISCOPYONWRITE(old_entry) && UVM_ET_ISNEEDSCOPY(old_entry)) { panic("fork: non-copy_on_write map entry marked " "needs_copy (illegal)"); } /* Apply inheritance. 
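 *
 * The per-entry inheritance code dispatched on below is under
 * userland control.  A hedged userland sketch (minherit(2) with the
 * constants from <sys/mman.h>; buf and len are hypothetical):
 *
 *	#include <sys/mman.h>
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	minherit(buf, len, MAP_INHERIT_ZERO);
 *	if (fork() == 0) {
 *		... child: same address, but zero-filled memory ...
 *	}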
*/
		switch (old_entry->inheritance) {
		case MAP_INHERIT_SHARE:
			new_entry = uvm_mapent_forkshared(vm2, new_map,
			    old_map, old_entry, &dead);
			break;
		case MAP_INHERIT_COPY:
			new_entry = uvm_mapent_forkcopy(vm2, new_map,
			    old_map, old_entry, &dead);
			break;
		case MAP_INHERIT_ZERO:
			new_entry = uvm_mapent_forkzero(vm2, new_map,
			    old_map, old_entry, &dead);
			break;
		default:
			continue;
		}

		/* Update process statistics. */
		if (!UVM_ET_ISHOLE(new_entry))
			new_map->size += new_entry->end - new_entry->start;
		if (!UVM_ET_ISOBJ(new_entry) && !UVM_ET_ISHOLE(new_entry) &&
		    new_entry->protection != PROT_NONE) {
			vm2->vm_dused += uvmspace_dused(
			    new_map, new_entry->start, new_entry->end);
		}
	}
	vm_map_unlock(old_map);
	vm_map_unlock(new_map);

	/*
	 * This can actually happen, if multiple entries described a
	 * space in which an entry was inherited.
	 */
	uvm_unmap_detach(&dead, 0);

#ifdef SYSVSHM
	if (vm1->vm_shm)
		shmfork(vm1, vm2);
#endif

	return vm2;
}

/*
 * uvm_map_hint: return the beginning of the best area suitable for
 * creating a new mapping with "prot" protection.
 */
vaddr_t
uvm_map_hint(struct vmspace *vm, vm_prot_t prot, vaddr_t minaddr,
    vaddr_t maxaddr)
{
	vaddr_t addr;
	vaddr_t spacing;

#ifdef __i386__
	/*
	 * If executable skip first two pages, otherwise start
	 * after data + heap region.
	 */
	if ((prot & PROT_EXEC) != 0 &&
	    (vaddr_t)vm->vm_daddr >= I386_MAX_EXE_ADDR) {
		addr = (PAGE_SIZE*2) +
		    (arc4random() & (I386_MAX_EXE_ADDR / 2 - 1));
		return (round_page(addr));
	}
#endif

#if defined (__LP64__)
	spacing = MIN(4UL * 1024 * 1024 * 1024, MAXDSIZ) - 1;
#else
	spacing = MIN(1 * 1024 * 1024 * 1024, MAXDSIZ) - 1;
#endif

	/*
	 * Start malloc/mmap after the brk.
	 */
	addr = (vaddr_t)vm->vm_daddr + BRKSIZ;
	addr = MAX(addr, minaddr);

	if (addr < maxaddr) {
		while (spacing > maxaddr - addr)
			spacing >>= 1;
	}
	addr += arc4random() & spacing;
	return (round_page(addr));
}

/*
 * uvm_map_submap: punch down part of a map into a submap
 *
 * => only the kernel_map is allowed to be submapped
 * => the purpose of submapping is to break up the locking granularity
 *	of a larger map
 * => the range specified must have been mapped previously with a uvm_map()
 *	call [with uobj==NULL] to create a blank map entry in the main map.
 *	[And it had better still be blank!]
 * => maps which contain submaps should never be copied or forked.
 * => to remove a submap, use uvm_unmap() on the main map
 *	and then uvm_map_deallocate() the submap.
 * => main map must be unlocked.
 * => submap must have been init'd and have a zero reference count.
 *	[need not be locked as we don't actually reference it]
 */
int
uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
    struct vm_map *submap)
{
	struct vm_map_entry *entry;
	int result;

	if (start > map->max_offset || end > map->max_offset ||
	    start < map->min_offset || end < map->min_offset)
		return EINVAL;

	vm_map_lock(map);

	if (uvm_map_lookup_entry(map, start, &entry)) {
		UVM_MAP_CLIP_START(map, entry, start);
		UVM_MAP_CLIP_END(map, entry, end);
	} else
		entry = NULL;

	if (entry != NULL &&
	    entry->start == start && entry->end == end &&
	    entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
	    !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
		entry->etype |= UVM_ET_SUBMAP;
		entry->object.sub_map = submap;
		entry->offset = 0;
		uvm_map_reference(submap);
		result = 0;
	} else
		result = EINVAL;

	vm_map_unlock(map);
	return result;
}

/*
 * uvm_map_checkprot: check protection in map
 *
 * => must allow specific protection in a fully allocated region.
 * => map must be read or write locked by caller.
*/ boolean_t uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end, vm_prot_t protection) { struct vm_map_entry *entry; if (start < map->min_offset || end > map->max_offset || start > end) return FALSE; if (start == end) return TRUE; /* * Iterate entries. */ for (entry = uvm_map_entrybyaddr(&map->addr, start); entry != NULL && entry->start < end; entry = RBT_NEXT(uvm_map_addr, entry)) { /* Fail if a hole is found. */ if (UVM_ET_ISHOLE(entry) || (entry->end < end && entry->end != VMMAP_FREE_END(entry))) return FALSE; /* Check protection. */ if ((entry->protection & protection) != protection) return FALSE; } return TRUE; } /* * uvm_map_create: create map */ vm_map_t uvm_map_create(pmap_t pmap, vaddr_t min, vaddr_t max, int flags) { vm_map_t map; map = malloc(sizeof *map, M_VMMAP, M_WAITOK); uvm_map_setup(map, pmap, min, max, flags); return (map); } /* * uvm_map_deallocate: drop reference to a map * * => caller must not lock map * => we will zap map if ref count goes to zero */ void uvm_map_deallocate(vm_map_t map) { int c; struct uvm_map_deadq dead; c = --map->ref_count; if (c > 0) { return; } /* * all references gone. unmap and free. * * No lock required: we are only one to access this map. */ TAILQ_INIT(&dead); uvm_tree_sanity(map, __FILE__, __LINE__); uvm_unmap_remove(map, map->min_offset, map->max_offset, &dead, TRUE, FALSE); pmap_destroy(map->pmap); KASSERT(RBT_EMPTY(uvm_map_addr, &map->addr)); free(map, M_VMMAP, sizeof *map); uvm_unmap_detach(&dead, 0); } /* * uvm_map_inherit: set inheritance code for range of addrs in map. * * => map must be unlocked * => note that the inherit code is used during a "fork". see fork * code for details. */ int uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end, vm_inherit_t new_inheritance) { struct vm_map_entry *entry; switch (new_inheritance) { case MAP_INHERIT_NONE: case MAP_INHERIT_COPY: case MAP_INHERIT_SHARE: case MAP_INHERIT_ZERO: break; default: return (EINVAL); } if (start > end) return EINVAL; start = MAX(start, map->min_offset); end = MIN(end, map->max_offset); if (start >= end) return 0; vm_map_lock(map); entry = uvm_map_entrybyaddr(&map->addr, start); if (entry->end > start) UVM_MAP_CLIP_START(map, entry, start); else entry = RBT_NEXT(uvm_map_addr, entry); while (entry != NULL && entry->start < end) { UVM_MAP_CLIP_END(map, entry, end); entry->inheritance = new_inheritance; entry = RBT_NEXT(uvm_map_addr, entry); } vm_map_unlock(map); return (0); } /* * uvm_map_syscall: permit system calls for range of addrs in map. * * => map must be unlocked */ int uvm_map_syscall(struct vm_map *map, vaddr_t start, vaddr_t end) { struct vm_map_entry *entry; if (start > end) return EINVAL; start = MAX(start, map->min_offset); end = MIN(end, map->max_offset); if (start >= end) return 0; if (map->flags & VM_MAP_SYSCALL_ONCE) /* only allowed once */ return (EPERM); vm_map_lock(map); entry = uvm_map_entrybyaddr(&map->addr, start); if (entry->end > start) UVM_MAP_CLIP_START(map, entry, start); else entry = RBT_NEXT(uvm_map_addr, entry); while (entry != NULL && entry->start < end) { UVM_MAP_CLIP_END(map, entry, end); entry->etype |= UVM_ET_SYSCALL; entry = RBT_NEXT(uvm_map_addr, entry); } map->wserial++; map->flags |= VM_MAP_SYSCALL_ONCE; vm_map_unlock(map); return (0); } /* * uvm_map_advice: set advice code for range of addrs in map. 
* * => map must be unlocked */ int uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice) { struct vm_map_entry *entry; switch (new_advice) { case MADV_NORMAL: case MADV_RANDOM: case MADV_SEQUENTIAL: break; default: return (EINVAL); } if (start > end) return EINVAL; start = MAX(start, map->min_offset); end = MIN(end, map->max_offset); if (start >= end) return 0; vm_map_lock(map); entry = uvm_map_entrybyaddr(&map->addr, start); if (entry != NULL && entry->end > start) UVM_MAP_CLIP_START(map, entry, start); else if (entry!= NULL) entry = RBT_NEXT(uvm_map_addr, entry); /* * XXXJRT: disallow holes? */ while (entry != NULL && entry->start < end) { UVM_MAP_CLIP_END(map, entry, end); entry->advice = new_advice; entry = RBT_NEXT(uvm_map_addr, entry); } vm_map_unlock(map); return (0); } /* * uvm_map_extract: extract a mapping from a map and put it somewhere * in the kernel_map, setting protection to max_prot. * * => map should be unlocked (we will write lock it and kernel_map) * => returns 0 on success, error code otherwise * => start must be page aligned * => len must be page sized * => flags: * UVM_EXTRACT_FIXPROT: set prot to maxprot as we go * Mappings are QREF's. */ int uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len, vaddr_t *dstaddrp, int flags) { struct uvm_map_deadq dead; struct vm_map_entry *first, *entry, *newentry, *tmp1, *tmp2; vaddr_t dstaddr; vaddr_t end; vaddr_t cp_start; vsize_t cp_len, cp_off; int error; TAILQ_INIT(&dead); end = start + len; /* * Sanity check on the parameters. * Also, since the mapping may not contain gaps, error out if the * mapped area is not in source map. */ if ((start & (vaddr_t)PAGE_MASK) != 0 || (end & (vaddr_t)PAGE_MASK) != 0 || end < start) return EINVAL; if (start < srcmap->min_offset || end > srcmap->max_offset) return EINVAL; /* Initialize dead entries. Handle len == 0 case. */ if (len == 0) return 0; /* Acquire lock on srcmap. */ vm_map_lock(srcmap); /* Lock srcmap, lookup first and last entry in <start,len>. */ first = uvm_map_entrybyaddr(&srcmap->addr, start); /* Check that the range is contiguous. */ for (entry = first; entry != NULL && entry->end < end; entry = RBT_NEXT(uvm_map_addr, entry)) { if (VMMAP_FREE_END(entry) != entry->end || UVM_ET_ISHOLE(entry)) { error = EINVAL; goto fail; } } if (entry == NULL || UVM_ET_ISHOLE(entry)) { error = EINVAL; goto fail; } /* * Handle need-copy flag. */ for (entry = first; entry != NULL && entry->start < end; entry = RBT_NEXT(uvm_map_addr, entry)) { if (UVM_ET_ISNEEDSCOPY(entry)) amap_copy(srcmap, entry, M_NOWAIT, UVM_ET_ISSTACK(entry) ? FALSE : TRUE, start, end); if (UVM_ET_ISNEEDSCOPY(entry)) { /* * amap_copy failure */ error = ENOMEM; goto fail; } } /* Lock destination map (kernel_map). */ vm_map_lock(kernel_map); if (uvm_map_findspace(kernel_map, &tmp1, &tmp2, &dstaddr, len, MAX(PAGE_SIZE, PMAP_PREFER_ALIGN()), PMAP_PREFER_OFFSET(start), PROT_NONE, 0) != 0) { error = ENOMEM; goto fail2; } *dstaddrp = dstaddr; /* * We now have srcmap and kernel_map locked. * dstaddr contains the destination offset in dstmap. */ /* step 1: start looping through map entries, performing extraction. */ for (entry = first; entry != NULL && entry->start < end; entry = RBT_NEXT(uvm_map_addr, entry)) { KDASSERT(!UVM_ET_ISNEEDSCOPY(entry)); if (UVM_ET_ISHOLE(entry)) continue; /* Calculate uvm_mapent_clone parameters. 
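 *
 * Worked example (added, hypothetical numbers): for an entry spanning
 * [0x2000, 0x6000) and an extract range of start = 0x3000,
 * end = 0x5000, the code below computes
 *
 *	cp_off   = 0x3000 - 0x2000 = 0x1000
 *	cp_start = 0x3000
 *	cp_len   = MIN(0x6000, 0x5000) - 0x3000 = 0x2000
 *
 * so the clone covers only the overlap of the entry and the range.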
*/ cp_start = entry->start; if (cp_start < start) { cp_off = start - cp_start; cp_start = start; } else cp_off = 0; cp_len = MIN(entry->end, end) - cp_start; newentry = uvm_mapent_clone(kernel_map, cp_start - start + dstaddr, cp_len, cp_off, entry->protection, entry->max_protection, entry, &dead, flags, AMAP_SHARED | AMAP_REFALL); if (newentry == NULL) { error = ENOMEM; goto fail2_unmap; } kernel_map->size += cp_len; if (flags & UVM_EXTRACT_FIXPROT) newentry->protection = newentry->max_protection; /* * Step 2: perform pmap copy. * (Doing this in the loop saves one RB traversal.) */ pmap_copy(kernel_map->pmap, srcmap->pmap, cp_start - start + dstaddr, cp_len, cp_start); } pmap_update(kernel_map->pmap); error = 0; /* Unmap copied entries on failure. */ fail2_unmap: if (error) { uvm_unmap_remove(kernel_map, dstaddr, dstaddr + len, &dead, FALSE, TRUE); } /* Release maps, release dead entries. */ fail2: vm_map_unlock(kernel_map); fail: vm_map_unlock(srcmap); uvm_unmap_detach(&dead, 0); return error; } /* * uvm_map_clean: clean out a map range * * => valid flags: * if (flags & PGO_CLEANIT): dirty pages are cleaned first * if (flags & PGO_SYNCIO): dirty pages are written synchronously * if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean * if (flags & PGO_FREE): any cached pages are freed after clean * => returns an error if any part of the specified range isn't mapped * => never a need to flush amap layer since the anonymous memory has * no permanent home, but may deactivate pages there * => called from sys_msync() and sys_madvise() * => caller must not write-lock map (read OK). * => we may sleep while cleaning if SYNCIO [with map read-locked] */ int uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags) { struct vm_map_entry *first, *entry; struct vm_amap *amap; struct vm_anon *anon; struct vm_page *pg; struct uvm_object *uobj; vaddr_t cp_start, cp_end; int refs; int error; boolean_t rv; KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) != (PGO_FREE|PGO_DEACTIVATE)); if (start > end || start < map->min_offset || end > map->max_offset) return EINVAL; vm_map_lock_read(map); first = uvm_map_entrybyaddr(&map->addr, start); /* Make a first pass to check for holes. */ for (entry = first; entry != NULL && entry->start < end; entry = RBT_NEXT(uvm_map_addr, entry)) { if (UVM_ET_ISSUBMAP(entry)) { vm_map_unlock_read(map); return EINVAL; } if (UVM_ET_ISSUBMAP(entry) || UVM_ET_ISHOLE(entry) || (entry->end < end && VMMAP_FREE_END(entry) != entry->end)) { vm_map_unlock_read(map); return EFAULT; } } error = 0; for (entry = first; entry != NULL && entry->start < end; entry = RBT_NEXT(uvm_map_addr, entry)) { amap = entry->aref.ar_amap; /* top layer */ if (UVM_ET_ISOBJ(entry)) uobj = entry->object.uvm_obj; else uobj = NULL; /* * No amap cleaning necessary if: * - there's no amap * - we're not deactivating or freeing pages. */ if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) goto flush_object; cp_start = MAX(entry->start, start); cp_end = MIN(entry->end, end); amap_lock(amap); for (; cp_start != cp_end; cp_start += PAGE_SIZE) { anon = amap_lookup(&entry->aref, cp_start - entry->start); if (anon == NULL) continue; KASSERT(anon->an_lock == amap->am_lock); pg = anon->an_page; if (pg == NULL) { continue; } KASSERT(pg->pg_flags & PQ_ANON); switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) { /* * XXX In these first 3 cases, we always just * XXX deactivate the page. We may want to * XXX handle the different cases more * XXX specifically, in the future. 
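 * XXX
 * XXX Added note: "deactivate" only moves the page to the inactive
 * XXX queue (uvm_pagedeactivate() below); its contents survive until
 * XXX the pagedaemon reclaims it.  The standalone PGO_FREE case, by
 * XXX contrast, removes the anon from the amap with amap_unadd() and
 * XXX drops its reference right away.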
*/ case PGO_CLEANIT|PGO_FREE: case PGO_CLEANIT|PGO_DEACTIVATE: case PGO_DEACTIVATE: deactivate_it: /* skip the page if it's wired */ if (pg->wire_count != 0) break; uvm_lock_pageq(); KASSERT(pg->uanon == anon); /* zap all mappings for the page. */ pmap_page_protect(pg, PROT_NONE); /* ...and deactivate the page. */ uvm_pagedeactivate(pg); uvm_unlock_pageq(); break; case PGO_FREE: /* * If there are multiple references to * the amap, just deactivate the page. */ if (amap_refs(amap) > 1) goto deactivate_it; /* XXX skip the page if it's wired */ if (pg->wire_count != 0) { break; } amap_unadd(&entry->aref, cp_start - entry->start); refs = --anon->an_ref; if (refs == 0) uvm_anfree(anon); break; default: panic("uvm_map_clean: weird flags"); } } amap_unlock(amap); flush_object: cp_start = MAX(entry->start, start); cp_end = MIN(entry->end, end); /* * flush pages if we've got a valid backing object. * * Don't PGO_FREE if we don't have write permission * and don't flush if this is a copy-on-write object * since we can't know our permissions on it. */ if (uobj != NULL && ((flags & PGO_FREE) == 0 || ((entry->max_protection & PROT_WRITE) != 0 && (entry->etype & UVM_ET_COPYONWRITE) == 0))) { rv = uobj->pgops->pgo_flush(uobj, cp_start - entry->start + entry->offset, cp_end - entry->start + entry->offset, flags); if (rv == FALSE) error = EFAULT; } } vm_map_unlock_read(map); return error; } /* * UVM_MAP_CLIP_END implementation */ void uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr) { struct vm_map_entry *tmp; KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr); tmp = uvm_mapent_alloc(map, 0); /* Invoke splitentry. */ uvm_map_splitentry(map, entry, tmp, addr); } /* * UVM_MAP_CLIP_START implementation * * Clippers are required to not change the pointers to the entry they are * clipping on. * Since uvm_map_splitentry turns the original entry into the lowest * entry (address wise) we do a swap between the new entry and the original * entry, prior to calling uvm_map_splitentry. */ void uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr) { struct vm_map_entry *tmp; struct uvm_addr_state *free; /* Unlink original. */ free = uvm_map_uaddr_e(map, entry); uvm_mapent_free_remove(map, free, entry); uvm_mapent_addr_remove(map, entry); /* Copy entry. */ KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr); tmp = uvm_mapent_alloc(map, 0); uvm_mapent_copy(entry, tmp); /* Put new entry in place of original entry. */ uvm_mapent_addr_insert(map, tmp); uvm_mapent_free_insert(map, free, tmp); /* Invoke splitentry. */ uvm_map_splitentry(map, tmp, entry, addr); } /* * Boundary fixer. */ static inline vaddr_t uvm_map_boundfix(vaddr_t, vaddr_t, vaddr_t); static inline vaddr_t uvm_map_boundfix(vaddr_t min, vaddr_t max, vaddr_t bound) { return (min < bound && max > bound) ? bound : max; } /* * Choose free list based on address at start of free space. * * The uvm_addr_state returned contains addr and is the first of: * - uaddr_exe * - uaddr_brk_stack * - uaddr_any */ struct uvm_addr_state* uvm_map_uaddr(struct vm_map *map, vaddr_t addr) { struct uvm_addr_state *uaddr; int i; /* Special case the first page, to prevent mmap from returning 0. */ if (addr < VMMAP_MIN_ADDR) return NULL; /* Upper bound for kernel maps at uvm_maxkaddr. */ if ((map->flags & VM_MAP_ISVMSPACE) == 0) { if (addr >= uvm_maxkaddr) return NULL; } /* Is the address inside the exe-only map? 
*/
	if (map->uaddr_exe != NULL &&
	    addr >= map->uaddr_exe->uaddr_minaddr &&
	    addr < map->uaddr_exe->uaddr_maxaddr)
		return map->uaddr_exe;

	/* Check if the space falls inside brk/stack area. */
	if ((addr >= map->b_start && addr < map->b_end) ||
	    (addr >= map->s_start && addr < map->s_end)) {
		if (map->uaddr_brk_stack != NULL &&
		    addr >= map->uaddr_brk_stack->uaddr_minaddr &&
		    addr < map->uaddr_brk_stack->uaddr_maxaddr) {
			return map->uaddr_brk_stack;
		} else
			return NULL;
	}

	/*
	 * Check the other selectors.
	 *
	 * These selectors are only marked as the owner, if they have insert
	 * functions.
	 */
	for (i = 0; i < nitems(map->uaddr_any); i++) {
		uaddr = map->uaddr_any[i];
		if (uaddr == NULL)
			continue;
		if (uaddr->uaddr_functions->uaddr_free_insert == NULL)
			continue;

		if (addr >= uaddr->uaddr_minaddr &&
		    addr < uaddr->uaddr_maxaddr)
			return uaddr;
	}

	return NULL;
}

/*
 * Choose free list based on address at start of free space.
 *
 * The uvm_addr_state returned contains addr and is the first of:
 * - uaddr_exe
 * - uaddr_brk_stack
 * - uaddr_any
 */
struct uvm_addr_state*
uvm_map_uaddr_e(struct vm_map *map, struct vm_map_entry *entry)
{
	return uvm_map_uaddr(map, VMMAP_FREE_START(entry));
}

/*
 * Returns the first free-memory boundary that is crossed by [min-max].
 */
vsize_t
uvm_map_boundary(struct vm_map *map, vaddr_t min, vaddr_t max)
{
	struct uvm_addr_state *uaddr;
	int i;

	/* Never return first page. */
	max = uvm_map_boundfix(min, max, VMMAP_MIN_ADDR);

	/* Treat the maxkaddr special, if the map is a kernel_map. */
	if ((map->flags & VM_MAP_ISVMSPACE) == 0)
		max = uvm_map_boundfix(min, max, uvm_maxkaddr);

	/* Check for exe-only boundaries. */
	if (map->uaddr_exe != NULL) {
		max = uvm_map_boundfix(min, max,
		    map->uaddr_exe->uaddr_minaddr);
		max = uvm_map_boundfix(min, max,
		    map->uaddr_exe->uaddr_maxaddr);
	}

	/* Check for brk/stack boundaries. */
	if (map->uaddr_brk_stack != NULL) {
		max = uvm_map_boundfix(min, max,
		    map->uaddr_brk_stack->uaddr_minaddr);
		max = uvm_map_boundfix(min, max,
		    map->uaddr_brk_stack->uaddr_maxaddr);
	}

	/* Check other boundaries. */
	for (i = 0; i < nitems(map->uaddr_any); i++) {
		uaddr = map->uaddr_any[i];
		if (uaddr != NULL) {
			max = uvm_map_boundfix(min, max, uaddr->uaddr_minaddr);
			max = uvm_map_boundfix(min, max, uaddr->uaddr_maxaddr);
		}
	}

	/* Boundaries at stack and brk() area. */
	max = uvm_map_boundfix(min, max, map->s_start);
	max = uvm_map_boundfix(min, max, map->s_end);
	max = uvm_map_boundfix(min, max, map->b_start);
	max = uvm_map_boundfix(min, max, map->b_end);

	return max;
}

/*
 * Update map allocation start and end addresses from proc vmspace.
 */
void
uvm_map_vmspace_update(struct vm_map *map, struct uvm_map_deadq *dead,
    int flags)
{
	struct vmspace *vm;
	vaddr_t b_start, b_end, s_start, s_end;

	KASSERT(map->flags & VM_MAP_ISVMSPACE);
	KASSERT(offsetof(struct vmspace, vm_map) == 0);

	/*
	 * Derive actual allocation boundaries from vmspace.
*/ vm = (struct vmspace *)map; b_start = (vaddr_t)vm->vm_daddr; b_end = b_start + BRKSIZ; s_start = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); s_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); #ifdef DIAGNOSTIC if ((b_start & (vaddr_t)PAGE_MASK) != 0 || (b_end & (vaddr_t)PAGE_MASK) != 0 || (s_start & (vaddr_t)PAGE_MASK) != 0 || (s_end & (vaddr_t)PAGE_MASK) != 0) { panic("uvm_map_vmspace_update: vmspace %p invalid bounds: " "b=0x%lx-0x%lx s=0x%lx-0x%lx", vm, b_start, b_end, s_start, s_end); } #endif if (__predict_true(map->b_start == b_start && map->b_end == b_end && map->s_start == s_start && map->s_end == s_end)) return; uvm_map_freelist_update(map, dead, b_start, b_end, s_start, s_end, flags); } /* * Grow kernel memory. * * This function is only called for kernel maps when an allocation fails. * * If the map has a gap that is large enough to accommodate alloc_sz, this * function will make sure map->free will include it. */ void uvm_map_kmem_grow(struct vm_map *map, struct uvm_map_deadq *dead, vsize_t alloc_sz, int flags) { vsize_t sz; vaddr_t end; struct vm_map_entry *entry; /* Kernel memory only. */ KASSERT((map->flags & VM_MAP_ISVMSPACE) == 0); /* Destroy free list. */ uvm_map_freelist_update_clear(map, dead); /* Include the guard page in the hard minimum requirement of alloc_sz. */ if (map->flags & VM_MAP_GUARDPAGES) alloc_sz += PAGE_SIZE; /* * Grow by ALLOCMUL * alloc_sz, but at least VM_MAP_KSIZE_DELTA. * * Don't handle the case where the multiplication overflows: * if that happens, the allocation is probably too big anyway. */ sz = MAX(VM_MAP_KSIZE_ALLOCMUL * alloc_sz, VM_MAP_KSIZE_DELTA); /* * Walk forward until a gap large enough for alloc_sz shows up. * * We assume the kernel map has no boundaries. * uvm_maxkaddr may be zero. */ end = MAX(uvm_maxkaddr, map->min_offset); entry = uvm_map_entrybyaddr(&map->addr, end); while (entry && entry->fspace < alloc_sz) entry = RBT_NEXT(uvm_map_addr, entry); if (entry) { end = MAX(VMMAP_FREE_START(entry), end); end += MIN(sz, map->max_offset - end); } else end = map->max_offset; /* Reserve pmap entries. */ #ifdef PMAP_GROWKERNEL uvm_maxkaddr = pmap_growkernel(end); #else uvm_maxkaddr = MAX(uvm_maxkaddr, end); #endif /* Rebuild free list. */ uvm_map_freelist_update_refill(map, flags); } /* * Freelist update subfunction: unlink all entries from freelists. */ void uvm_map_freelist_update_clear(struct vm_map *map, struct uvm_map_deadq *dead) { struct uvm_addr_state *free; struct vm_map_entry *entry, *prev, *next; prev = NULL; for (entry = RBT_MIN(uvm_map_addr, &map->addr); entry != NULL; entry = next) { next = RBT_NEXT(uvm_map_addr, entry); free = uvm_map_uaddr_e(map, entry); uvm_mapent_free_remove(map, free, entry); if (prev != NULL && entry->start == entry->end) { prev->fspace += VMMAP_FREE_END(entry) - entry->end; uvm_mapent_addr_remove(map, entry); DEAD_ENTRY_PUSH(dead, entry); } else prev = entry; } } /* * Freelist update subfunction: refill the freelists with entries. */ void uvm_map_freelist_update_refill(struct vm_map *map, int flags) { struct vm_map_entry *entry; vaddr_t min, max; RBT_FOREACH(entry, uvm_map_addr, &map->addr) { min = VMMAP_FREE_START(entry); max = VMMAP_FREE_END(entry); entry->fspace = 0; entry = uvm_map_fix_space(map, entry, min, max, flags); } uvm_tree_sanity(map, __FILE__, __LINE__); } /* * Change {a,b}_{start,end} allocation ranges and associated free lists. 
*/ void uvm_map_freelist_update(struct vm_map *map, struct uvm_map_deadq *dead, vaddr_t b_start, vaddr_t b_end, vaddr_t s_start, vaddr_t s_end, int flags) { KDASSERT(b_end >= b_start && s_end >= s_start); /* Clear all free lists. */ uvm_map_freelist_update_clear(map, dead); /* Apply new bounds. */ map->b_start = b_start; map->b_end = b_end; map->s_start = s_start; map->s_end = s_end; /* Refill free lists. */ uvm_map_freelist_update_refill(map, flags); } /* * Assign a uvm_addr_state to the specified pointer in vm_map. * * May sleep. */ void uvm_map_set_uaddr(struct vm_map *map, struct uvm_addr_state **which, struct uvm_addr_state *newval) { struct uvm_map_deadq dead; /* Pointer which must be in this map. */ KASSERT(which != NULL); KASSERT((void*)map <= (void*)(which) && (void*)(which) < (void*)(map + 1)); vm_map_lock(map); TAILQ_INIT(&dead); uvm_map_freelist_update_clear(map, &dead); uvm_addr_destroy(*which); *which = newval; uvm_map_freelist_update_refill(map, 0); vm_map_unlock(map); uvm_unmap_detach(&dead, 0); } /* * Correct space insert. * * Entry must not be on any freelist. */ struct vm_map_entry* uvm_map_fix_space(struct vm_map *map, struct vm_map_entry *entry, vaddr_t min, vaddr_t max, int flags) { struct uvm_addr_state *free, *entfree; vaddr_t lmax; KASSERT(entry == NULL || (entry->etype & UVM_ET_FREEMAPPED) == 0); KDASSERT(min <= max); KDASSERT((entry != NULL && VMMAP_FREE_END(entry) == min) || min == map->min_offset); /* * During the function, entfree will always point at the uaddr state * for entry. */ entfree = (entry == NULL ? NULL : uvm_map_uaddr_e(map, entry)); while (min != max) { /* Claim guard page for entry. */ if ((map->flags & VM_MAP_GUARDPAGES) && entry != NULL && VMMAP_FREE_END(entry) == entry->end && entry->start != entry->end) { if (max - min == 2 * PAGE_SIZE) { /* * If the free-space gap is exactly 2 pages, * we make the guard 2 pages instead of 1. * Because in a guarded map, an area needs * at least 2 pages to allocate from: * one page for the allocation and one for * the guard. */ entry->guard = 2 * PAGE_SIZE; min = max; } else { entry->guard = PAGE_SIZE; min += PAGE_SIZE; } continue; } /* * Handle the case where entry has a 2-page guard, but the * space after entry is freed. */ if (entry != NULL && entry->fspace == 0 && entry->guard > PAGE_SIZE) { entry->guard = PAGE_SIZE; min = VMMAP_FREE_START(entry); } lmax = uvm_map_boundary(map, min, max); free = uvm_map_uaddr(map, min); /* * Entries are merged if they point at the same uvm_free(). * Exception to that rule: if min == uvm_maxkaddr, a new * entry is started regardless (otherwise the allocators * will get confused). */ if (entry != NULL && free == entfree && !((map->flags & VM_MAP_ISVMSPACE) == 0 && min == uvm_maxkaddr)) { KDASSERT(VMMAP_FREE_END(entry) == min); entry->fspace += lmax - min; } else { /* * Commit entry to free list: it'll not be added to * anymore. * We'll start a new entry and add to that entry * instead. */ if (entry != NULL) uvm_mapent_free_insert(map, entfree, entry); /* New entry for new uaddr. */ entry = uvm_mapent_alloc(map, flags); KDASSERT(entry != NULL); entry->end = entry->start = min; entry->guard = 0; entry->fspace = lmax - min; entry->object.uvm_obj = NULL; entry->offset = 0; entry->etype = 0; entry->protection = entry->max_protection = 0; entry->inheritance = 0; entry->wired_count = 0; entry->advice = 0; entry->aref.ar_pageoff = 0; entry->aref.ar_amap = NULL; uvm_mapent_addr_insert(map, entry); entfree = free; } min = lmax; } /* Finally put entry on the uaddr state. 
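 *
 * (Recap of the guard sizing above, with hypothetical numbers: a free
 * gap of exactly 2 * PAGE_SIZE is consumed whole as a 2-page guard,
 * leaving nothing to allocate from; any larger gap donates a single
 * guard page and the remainder becomes usable free space.)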
*/ if (entry != NULL) uvm_mapent_free_insert(map, entfree, entry); return entry; } /* * MQuery style of allocation. * * This allocator searches forward until sufficient space is found to map * the given size. * * XXX: factor in offset (via pmap_prefer) and protection? */ int uvm_map_mquery(struct vm_map *map, vaddr_t *addr_p, vsize_t sz, voff_t offset, int flags) { struct vm_map_entry *entry, *last; vaddr_t addr; vaddr_t tmp, pmap_align, pmap_offset; int error; addr = *addr_p; vm_map_lock_read(map); /* Configure pmap prefer. */ if (offset != UVM_UNKNOWN_OFFSET) { pmap_align = MAX(PAGE_SIZE, PMAP_PREFER_ALIGN()); pmap_offset = PMAP_PREFER_OFFSET(offset); } else { pmap_align = PAGE_SIZE; pmap_offset = 0; } /* Align address to pmap_prefer unless FLAG_FIXED is set. */ if (!(flags & UVM_FLAG_FIXED) && offset != UVM_UNKNOWN_OFFSET) { tmp = (addr & ~(pmap_align - 1)) | pmap_offset; if (tmp < addr) tmp += pmap_align; addr = tmp; } /* First, check if the requested range is fully available. */ entry = uvm_map_entrybyaddr(&map->addr, addr); last = NULL; if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) { error = 0; goto out; } if (flags & UVM_FLAG_FIXED) { error = EINVAL; goto out; } error = ENOMEM; /* Default error from here. */ /* * At this point, the memory at <addr, sz> is not available. * The reasons are: * [1] it's outside the map, * [2] it starts in used memory (and therefore needs to move * toward the first free page in entry), * [3] it starts in free memory but bumps into used memory. * * Note that for case [2], the forward moving is handled by the * for loop below. */ if (entry == NULL) { /* [1] Outside the map. */ if (addr >= map->max_offset) goto out; else entry = RBT_MIN(uvm_map_addr, &map->addr); } else if (VMMAP_FREE_START(entry) <= addr) { /* [3] Bumped into used memory. */ entry = RBT_NEXT(uvm_map_addr, entry); } /* Test if the next entry is sufficient for the allocation. */ for (; entry != NULL; entry = RBT_NEXT(uvm_map_addr, entry)) { if (entry->fspace == 0) continue; addr = VMMAP_FREE_START(entry); restart: /* Restart address checks on address change. */ tmp = (addr & ~(pmap_align - 1)) | pmap_offset; if (tmp < addr) tmp += pmap_align; addr = tmp; if (addr >= VMMAP_FREE_END(entry)) continue; /* Skip brk() allocation addresses. */ if (addr + sz > map->b_start && addr < map->b_end) { if (VMMAP_FREE_END(entry) > map->b_end) { addr = map->b_end; goto restart; } else continue; } /* Skip stack allocation addresses. 
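 *
 * (The realignment above, with hypothetical numbers: pmap_align =
 * 0x10000 and pmap_offset = 0x3000 turn addr = 0x25000 into
 * tmp = (0x25000 & ~0xffff) | 0x3000 = 0x23000; since that moved
 * backwards, one pmap_align step is added, giving 0x33000.)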
*/ if (addr + sz > map->s_start && addr < map->s_end) { if (VMMAP_FREE_END(entry) > map->s_end) { addr = map->s_end; goto restart; } else continue; } last = NULL; if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) { error = 0; goto out; } } out: vm_map_unlock_read(map); if (error == 0) *addr_p = addr; return error; } boolean_t vm_map_lock_try_ln(struct vm_map *map, char *file, int line) { boolean_t rv; if (map->flags & VM_MAP_INTRSAFE) { rv = mtx_enter_try(&map->mtx); } else { mtx_enter(&map->flags_lock); if (map->flags & VM_MAP_BUSY) { mtx_leave(&map->flags_lock); return (FALSE); } mtx_leave(&map->flags_lock); rv = (rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP) == 0); /* check if the lock is busy and back out if we won the race */ if (rv) { mtx_enter(&map->flags_lock); if (map->flags & VM_MAP_BUSY) { rw_exit(&map->lock); rv = FALSE; } mtx_leave(&map->flags_lock); } } if (rv) { map->timestamp++; LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); uvm_tree_sanity(map, file, line); uvm_tree_size_chk(map, file, line); } return (rv); } void vm_map_lock_ln(struct vm_map *map, char *file, int line) { if ((map->flags & VM_MAP_INTRSAFE) == 0) { do { mtx_enter(&map->flags_lock); tryagain: while (map->flags & VM_MAP_BUSY) { map->flags |= VM_MAP_WANTLOCK; msleep_nsec(&map->flags, &map->flags_lock, PVM, vmmapbsy, INFSLP); } mtx_leave(&map->flags_lock); } while (rw_enter(&map->lock, RW_WRITE|RW_SLEEPFAIL) != 0); /* check if the lock is busy and back out if we won the race */ mtx_enter(&map->flags_lock); if (map->flags & VM_MAP_BUSY) { rw_exit(&map->lock); goto tryagain; } mtx_leave(&map->flags_lock); } else { mtx_enter(&map->mtx); } map->timestamp++; LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); uvm_tree_sanity(map, file, line); uvm_tree_size_chk(map, file, line); } void vm_map_lock_read_ln(struct vm_map *map, char *file, int line) { if ((map->flags & VM_MAP_INTRSAFE) == 0) rw_enter_read(&map->lock); else mtx_enter(&map->mtx); LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); uvm_tree_sanity(map, file, line); uvm_tree_size_chk(map, file, line); } void vm_map_unlock_ln(struct vm_map *map, char *file, int line) { uvm_tree_sanity(map, file, line); uvm_tree_size_chk(map, file, line); LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line)); if ((map->flags & VM_MAP_INTRSAFE) == 0) rw_exit(&map->lock); else mtx_leave(&map->mtx); } void vm_map_unlock_read_ln(struct vm_map *map, char *file, int line) { /* XXX: RO */ uvm_tree_sanity(map, file, line); /* XXX: RO */ uvm_tree_size_chk(map, file, line); LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line)); if ((map->flags & VM_MAP_INTRSAFE) == 0) rw_exit_read(&map->lock); else mtx_leave(&map->mtx); } void vm_map_downgrade_ln(struct vm_map *map, char *file, int line) { uvm_tree_sanity(map, file, line); uvm_tree_size_chk(map, file, line); LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line)); LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); KASSERT((map->flags & VM_MAP_INTRSAFE) == 0); if ((map->flags & VM_MAP_INTRSAFE) == 0) rw_enter(&map->lock, RW_DOWNGRADE); } void vm_map_upgrade_ln(struct vm_map *map, char *file, int line) { /* XXX: RO */ uvm_tree_sanity(map, file, line); /* XXX: RO */ uvm_tree_size_chk(map, file, line); LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line)); KASSERT((map->flags & VM_MAP_INTRSAFE) == 0); if ((map->flags & VM_MAP_INTRSAFE) == 0) { rw_exit_read(&map->lock); rw_enter_write(&map->lock); } LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); uvm_tree_sanity(map, file, line); } void 
vm_map_busy_ln(struct vm_map *map, char *file, int line) { KASSERT((map->flags & VM_MAP_INTRSAFE) == 0); mtx_enter(&map->flags_lock); map->flags |= VM_MAP_BUSY; mtx_leave(&map->flags_lock); } void vm_map_unbusy_ln(struct vm_map *map, char *file, int line) { int oflags; KASSERT((map->flags & VM_MAP_INTRSAFE) == 0); mtx_enter(&map->flags_lock); oflags = map->flags; map->flags &= ~(VM_MAP_BUSY|VM_MAP_WANTLOCK); mtx_leave(&map->flags_lock); if (oflags & VM_MAP_WANTLOCK) wakeup(&map->flags); } #ifndef SMALL_KERNEL int uvm_map_fill_vmmap(struct vm_map *map, struct kinfo_vmentry *kve, size_t *lenp) { struct vm_map_entry *entry; vaddr_t start; int cnt, maxcnt, error = 0; KASSERT(*lenp > 0); KASSERT((*lenp % sizeof(*kve)) == 0); cnt = 0; maxcnt = *lenp / sizeof(*kve); KASSERT(maxcnt > 0); /* * Return only entries whose address is above the given base * address. This allows userland to iterate without knowing the * number of entries beforehand. */ start = (vaddr_t)kve[0].kve_start; vm_map_lock(map); RBT_FOREACH(entry, uvm_map_addr, &map->addr) { if (cnt == maxcnt) { error = ENOMEM; break; } if (start != 0 && entry->start < start) continue; kve->kve_start = entry->start; kve->kve_end = entry->end; kve->kve_guard = entry->guard; kve->kve_fspace = entry->fspace; kve->kve_fspace_augment = entry->fspace_augment; kve->kve_offset = entry->offset; kve->kve_wired_count = entry->wired_count; kve->kve_etype = entry->etype; kve->kve_protection = entry->protection; kve->kve_max_protection = entry->max_protection; kve->kve_advice = entry->advice; kve->kve_inheritance = entry->inheritance; kve->kve_flags = entry->flags; kve++; cnt++; } vm_map_unlock(map); KASSERT(cnt <= maxcnt); *lenp = sizeof(*kve) * cnt; return error; } #endif RBT_GENERATE_AUGMENT(uvm_map_addr, vm_map_entry, daddrs.addr_entry, uvm_mapentry_addrcmp, uvm_map_addr_augment); /* * MD code: vmspace allocator setup. */ #ifdef __i386__ void uvm_map_setup_md(struct vm_map *map) { vaddr_t min, max; min = map->min_offset; max = map->max_offset; /* * Ensure the selectors will not try to manage page 0; * it's too special. */ if (min < VMMAP_MIN_ADDR) min = VMMAP_MIN_ADDR; #if 0 /* Cool stuff, not yet */ /* Executable code is special. */ map->uaddr_exe = uaddr_rnd_create(min, I386_MAX_EXE_ADDR); /* Place normal allocations beyond executable mappings. */ map->uaddr_any[3] = uaddr_pivot_create(2 * I386_MAX_EXE_ADDR, max); #else /* Crappy stuff, for now */ map->uaddr_any[0] = uaddr_rnd_create(min, max); #endif #ifndef SMALL_KERNEL map->uaddr_brk_stack = uaddr_stack_brk_create(min, max); #endif /* !SMALL_KERNEL */ } #elif __LP64__ void uvm_map_setup_md(struct vm_map *map) { vaddr_t min, max; min = map->min_offset; max = map->max_offset; /* * Ensure the selectors will not try to manage page 0; * it's too special. */ if (min < VMMAP_MIN_ADDR) min = VMMAP_MIN_ADDR; #if 0 /* Cool stuff, not yet */ map->uaddr_any[3] = uaddr_pivot_create(MAX(min, 0x100000000ULL), max); #else /* Crappy stuff, for now */ map->uaddr_any[0] = uaddr_rnd_create(min, max); #endif #ifndef SMALL_KERNEL map->uaddr_brk_stack = uaddr_stack_brk_create(min, max); #endif /* !SMALL_KERNEL */ } #else /* non-i386, 32 bit */ void uvm_map_setup_md(struct vm_map *map) { vaddr_t min, max; min = map->min_offset; max = map->max_offset; /* * Ensure the selectors will not try to manage page 0; * it's too special. 
*/ if (min < VMMAP_MIN_ADDR) min = VMMAP_MIN_ADDR; #if 0 /* Cool stuff, not yet */ map->uaddr_any[3] = uaddr_pivot_create(min, max); #else /* Crappy stuff, for now */ map->uaddr_any[0] = uaddr_rnd_create(min, max); #endif #ifndef SMALL_KERNEL map->uaddr_brk_stack = uaddr_stack_brk_create(min, max); #endif /* !SMALL_KERNEL */ } #endif
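/*
 * Added note (hedged): userland can walk a process map through
 * uvm_map_fill_vmmap() above by feeding the last seen end address back
 * in as the next base.  A minimal sketch, assuming the usual 3-element
 * KERN_PROC_VMMAP mib and that a filled buffer reports ENOMEM with
 * the length still updated; further error handling trimmed:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <errno.h>
 *	#include <unistd.h>
 *
 *	struct kinfo_vmentry kve[64];
 *	int mib[3] = { CTL_KERN, KERN_PROC_VMMAP, getpid() };
 *	size_t len, n;
 *
 *	kve[0].kve_start = 0;
 *	for (;;) {
 *		len = sizeof(kve);
 *		if (sysctl(mib, 3, kve, &len, NULL, 0) == -1 &&
 *		    errno != ENOMEM)
 *			break;
 *		n = len / sizeof(kve[0]);
 *		if (n == 0)
 *			break;
 *		... use kve[0] through kve[n - 1] ...
 *		kve[0].kve_start = kve[n - 1].kve_end;
 *	}
 */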
/*	$OpenBSD: rnd.c,v 1.222 2021/03/06 09:20:49 jsg Exp $	*/

/*
 * Copyright (c) 2011,2020 Theo de Raadt.
 * Copyright (c) 2008 Damien Miller.
 * Copyright (c) 1996, 1997, 2000-2002 Michael Shalayeff.
 * Copyright (c) 2013 Markus Friedl.
 * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, and the entire permission notice in its entirety,
 *    including the disclaimer of warranties.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * ALTERNATIVELY, this product may be distributed under the terms of
 * the GNU Public License, in which case the provisions of the GPL are
 * required INSTEAD OF the above restrictions.  (This clause is
 * necessary due to a potential bad interaction between the GPL and
 * the restrictions contained in a BSD-style copyright.)
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * The bootblocks pre-fill the kernel .openbsd.randomdata section with seed
 * material (on-disk from previous boot, hopefully mixed with a hardware rng).
 * The first arc4random(9) call initializes this seed material as a chacha
 * state.  Calls can be done early in kernel bootstrap code -- early use is
 * encouraged.
 *
 * After the kernel timeout subsystem is initialized, random_start() prepares
 * the entropy collection mechanism enqueue_randomness() and timeout-driven
 * mixing into the chacha state.  The first submissions come from device
 * probes, later on interrupt-time submissions are more common.  Entropy
 * data (and timing information) get mixed over the entropy input ring
 * rnd_event_space[] -- the goal is to collect damage.
 *
 * Based upon timeouts, a selection of the entropy ring rnd_event_space[]
 * is CRC bit-distributed and XOR mixed into entropy_pool[].
 *
 * From time to time, entropy_pool[] is SHA512-whitened, mixed with time
 * information again, XOR'd with the inner and outer states of the existing
 * chacha state, to create a new chacha state.
 *
 * During early boot (until cold=0), enqueue operations are immediately
 * dequeued, and mixed into the chacha.
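 *
 * Rough data flow of the above (added summary):
 *
 *	enqueue_randomness() -> rnd_event_space[] ring
 *	    -> dequeue_randomness() -> add_entropy_words()
 *	    -> entropy_pool[] -> extract_entropy() (SHA512)
 *	    -> _rs_seed() -> chacha state -> arc4random(9) family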
*/ #include <sys/param.h> #include <sys/event.h> #include <sys/ioctl.h> #include <sys/malloc.h> #include <sys/timeout.h> #include <sys/atomic.h> #include <sys/task.h> #include <sys/msgbuf.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <crypto/sha2.h> #define KEYSTREAM_ONLY #include <crypto/chacha_private.h> #include <uvm/uvm_extern.h> /* * For the purposes of better mixing, we use the CRC-32 polynomial as * well to make a twisted Generalized Feedback Shift Register * * (See M. Matsumoto & Y. Kurita, 1992. Twisted GFSR generators. ACM * Transactions on Modeling and Computer Simulation 2(3):179-194. * Also see M. Matsumoto & Y. Kurita, 1994. Twisted GFSR generators * II. ACM Transactions on Modeling and Computer Simulation 4:254-266) */ /* * Stirring polynomials over GF(2) for various pool sizes. Used in * add_entropy_words() below. * * The polynomial terms are chosen to be evenly spaced (minimum RMS * distance from evenly spaced; except for the last tap, which is 1 to * get the twisting happening as fast as possible. * * The resultant polynomial is: * 2^POOLWORDS + 2^POOL_TAP1 + 2^POOL_TAP2 + 2^POOL_TAP3 + 2^POOL_TAP4 + 1 */ #define POOLWORDS 2048 #define POOLBYTES (POOLWORDS*4) #define POOLMASK (POOLWORDS - 1) #define POOL_TAP1 1638 #define POOL_TAP2 1231 #define POOL_TAP3 819 #define POOL_TAP4 411 /* * Raw entropy collection from device drivers; at interrupt context or not. * enqueue_randomness() is used to submit data into the entropy input ring. */ #define QEVLEN 128 /* must be a power of 2 */ #define QEVCONSUME 8 /* how many events to consume a time */ #define KEYSZ 32 #define IVSZ 8 #define BLOCKSZ 64 #define RSBUFSZ (16*BLOCKSZ) #define EBUFSIZE KEYSZ + IVSZ struct rand_event { u_int re_time; u_int re_val; } rnd_event_space[QEVLEN]; u_int rnd_event_cons; u_int rnd_event_prod; int rnd_cold = 1; int rnd_slowextract = 1; void rnd_reinit(void *v); /* timeout to start reinit */ void rnd_init(void *); /* actually do the reinit */ static u_int32_t entropy_pool[POOLWORDS]; u_int32_t entropy_pool0[POOLWORDS] __attribute__((section(".openbsd.randomdata"))); void dequeue_randomness(void *); void add_entropy_words(const u_int32_t *, u_int); void extract_entropy(u_int8_t *) __attribute__((__bounded__(__minbytes__,1,EBUFSIZE))); struct timeout rnd_timeout = TIMEOUT_INITIALIZER(dequeue_randomness, NULL); int filt_randomread(struct knote *, long); void filt_randomdetach(struct knote *); int filt_randomwrite(struct knote *, long); static void _rs_seed(u_char *, size_t); static void _rs_clearseed(const void *p, size_t s); const struct filterops randomread_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_randomdetach, .f_event = filt_randomread, }; const struct filterops randomwrite_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_randomdetach, .f_event = filt_randomwrite, }; /* * This function mixes entropy and timing into the entropy input ring. 
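 *
 * A typical call site looks like this (added sketch; the driver and
 * its status register are hypothetical):
 *
 *	int
 *	mydrv_intr(void *arg)
 *	{
 *		u_int sts = mydrv_read_status(arg);
 *
 *		enqueue_randomness(sts);
 *		...
 *		return 1;
 *	}
 *
 * The value itself may be predictable; the cpu_rnd_messybits()
 * timestamp folded in below provides most of the damage.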
*/
void
enqueue_randomness(u_int val)
{
	struct rand_event *rep;
	int e;

	e = (atomic_inc_int_nv(&rnd_event_prod) - 1) & (QEVLEN-1);
	rep = &rnd_event_space[e];

	rep->re_time += cpu_rnd_messybits();
	rep->re_val += val;

	if (rnd_cold) {
		dequeue_randomness(NULL);
		rnd_init(NULL);
		if (!cold)
			rnd_cold = 0;
	} else if (!timeout_pending(&rnd_timeout) &&
	    (rnd_event_prod - rnd_event_cons) > QEVCONSUME) {
		rnd_slowextract = min(rnd_slowextract * 2, 5000);
		timeout_add_msec(&rnd_timeout, rnd_slowextract * 10);
	}
}

/*
 * This function merges entropy ring information into the buffer using
 * a polynomial to spread the bits.
 */
void
add_entropy_words(const u_int32_t *buf, u_int n)
{
	/* derived from IEEE 802.3 CRC-32 */
	static const u_int32_t twist_table[8] = {
		0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
		0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278
	};
	static u_int entropy_add_ptr;
	static u_char entropy_input_rotate;

	for (; n--; buf++) {
		u_int32_t w = (*buf << entropy_input_rotate) |
		    (*buf >> ((32 - entropy_input_rotate) & 31));
		u_int i = entropy_add_ptr =
		    (entropy_add_ptr - 1) & POOLMASK;
		/*
		 * Normally, we add 7 bits of rotation to the pool.
		 * At the beginning of the pool, add an extra 7 bits
		 * rotation, so that successive passes spread the
		 * input bits across the pool evenly.
		 */
		entropy_input_rotate =
		    (entropy_input_rotate + (i ? 7 : 14)) & 31;

		/* XOR pool contents corresponding to polynomial terms */
		w ^= entropy_pool[(i + POOL_TAP1) & POOLMASK] ^
		    entropy_pool[(i + POOL_TAP2) & POOLMASK] ^
		    entropy_pool[(i + POOL_TAP3) & POOLMASK] ^
		    entropy_pool[(i + POOL_TAP4) & POOLMASK] ^
		    entropy_pool[(i + 1) & POOLMASK] ^
		    entropy_pool[i]; /* + 2^POOLWORDS */

		entropy_pool[i] = (w >> 3) ^ twist_table[w & 7];
	}
}

/*
 * Pulls entropy out of the queue and merges it into the pool with the
 * CRC.  This takes a mix of fresh entries from the producer end of the
 * queue and entries from the consumer end of the queue which are
 * likely to have collected more damage.
 */
/* ARGSUSED */
void
dequeue_randomness(void *v)
{
	u_int32_t buf[2];
	u_int startp, startc, i;

	if (!rnd_cold)
		timeout_del(&rnd_timeout);

	/* Some very new damage */
	startp = rnd_event_prod - QEVCONSUME;
	for (i = 0; i < QEVCONSUME; i++) {
		u_int e = (startp + i) & (QEVLEN-1);

		buf[0] = rnd_event_space[e].re_time;
		buf[1] = rnd_event_space[e].re_val;
		add_entropy_words(buf, 2);
	}
	/* and some probably more damaged */
	startc = rnd_event_cons;
	for (i = 0; i < QEVCONSUME; i++) {
		u_int e = (startc + i) & (QEVLEN-1);

		buf[0] = rnd_event_space[e].re_time;
		buf[1] = rnd_event_space[e].re_val;
		add_entropy_words(buf, 2);
	}
	rnd_event_cons = startp + QEVCONSUME;
}

/*
 * Grabs a chunk from the entropy_pool[] and slams it through SHA512 when
 * requested.
 */
void
extract_entropy(u_int8_t *buf)
{
	static u_int32_t extract_pool[POOLWORDS];
	u_char digest[SHA512_DIGEST_LENGTH];
	SHA2_CTX shactx;

#if SHA512_DIGEST_LENGTH < EBUFSIZE
#error "need more bigger hash output"
#endif

	/*
	 * INTENTIONALLY not protected by any lock.  Races during
	 * memcpy() result in acceptable input data; races during
	 * SHA512Update() would create nasty data dependencies.  We
	 * do not rely on this as a benefit, but if it happens, cool.
	 */
	memcpy(extract_pool, entropy_pool, sizeof(extract_pool));

	/* Hash the pool to get the output */
	SHA512Init(&shactx);
	SHA512Update(&shactx, (u_int8_t *)extract_pool,
	    sizeof(extract_pool));
	SHA512Final(digest, &shactx);

	/* Copy data to destination buffer */
	memcpy(buf, digest, EBUFSIZE);

	/*
	 * Modify pool so next hash will produce different results.
* During boot-time enqueue/dequeue stage, avoid recursion. */ if (!rnd_cold) enqueue_randomness(extract_pool[0]); dequeue_randomness(NULL); /* Wipe data from memory */ explicit_bzero(extract_pool, sizeof(extract_pool)); explicit_bzero(digest, sizeof(digest)); } /* random keystream by ChaCha */ struct mutex rndlock = MUTEX_INITIALIZER(IPL_HIGH); struct timeout rndreinit_timeout = TIMEOUT_INITIALIZER(rnd_reinit, NULL); struct task rnd_task = TASK_INITIALIZER(rnd_init, NULL); static chacha_ctx rs; /* chacha context for random keystream */ /* keystream blocks (also chacha seed from boot) */ static u_char rs_buf[RSBUFSZ]; u_char rs_buf0[RSBUFSZ] __attribute__((section(".openbsd.randomdata"))); static size_t rs_have; /* valid bytes at end of rs_buf */ static size_t rs_count; /* bytes till reseed */ void suspend_randomness(void) { struct timespec ts; getnanotime(&ts); enqueue_randomness(ts.tv_sec); enqueue_randomness(ts.tv_nsec); dequeue_randomness(NULL); rs_count = 0; arc4random_buf(entropy_pool, sizeof(entropy_pool)); } void resume_randomness(char *buf, size_t buflen) { struct timespec ts; if (buf && buflen) _rs_seed(buf, buflen); getnanotime(&ts); enqueue_randomness(ts.tv_sec); enqueue_randomness(ts.tv_nsec); dequeue_randomness(NULL); rs_count = 0; } static inline void _rs_rekey(u_char *dat, size_t datlen); static inline void _rs_init(u_char *buf, size_t n) { KASSERT(n >= KEYSZ + IVSZ); chacha_keysetup(&rs, buf, KEYSZ * 8); chacha_ivsetup(&rs, buf + KEYSZ, NULL); } static void _rs_seed(u_char *buf, size_t n) { _rs_rekey(buf, n); /* invalidate rs_buf */ rs_have = 0; memset(rs_buf, 0, sizeof(rs_buf)); rs_count = 1600000; } static void _rs_stir(int do_lock) { struct timespec ts; u_int8_t buf[EBUFSIZE], *p; int i; /* * Use SHA512 PRNG data and a system timespec; early in the boot * process this is the best we can do -- some architectures do * not collect entropy very well during this time, but may have * clock information which is better than nothing. 
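 *
 * Size check (added): EBUFSIZE is KEYSZ + IVSZ = 32 + 8 = 40 bytes,
 * comfortably within the 64 bytes SHA512 produces, so the XOR with
 * the timespec below never runs past the hashed seed material.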
*/ extract_entropy(buf); nanotime(&ts); for (p = (u_int8_t *)&ts, i = 0; i < sizeof(ts); i++) buf[i] ^= p[i]; if (do_lock) mtx_enter(&rndlock); _rs_seed(buf, sizeof(buf)); if (do_lock) mtx_leave(&rndlock); explicit_bzero(buf, sizeof(buf)); /* encourage fast-dequeue again */ rnd_slowextract = 1; } static inline void _rs_stir_if_needed(size_t len) { static int rs_initialized; if (!rs_initialized) { memcpy(entropy_pool, entropy_pool0, sizeof(entropy_pool)); memcpy(rs_buf, rs_buf0, sizeof(rs_buf)); /* seeds cannot be cleaned yet, random_start() will do so */ _rs_init(rs_buf, KEYSZ + IVSZ); rs_count = 1024 * 1024 * 1024; /* until main() runs */ rs_initialized = 1; } else if (rs_count <= len) _rs_stir(0); else rs_count -= len; } static void _rs_clearseed(const void *p, size_t s) { struct kmem_dyn_mode kd_avoidalias; vaddr_t va = trunc_page((vaddr_t)p); vsize_t off = (vaddr_t)p - va; vsize_t len; vaddr_t rwva; paddr_t pa; while (s > 0) { pmap_extract(pmap_kernel(), va, &pa); memset(&kd_avoidalias, 0, sizeof(kd_avoidalias)); kd_avoidalias.kd_prefer = pa; kd_avoidalias.kd_waitok = 1; rwva = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_avoidalias); if (!rwva) panic("_rs_clearseed"); pmap_kenter_pa(rwva, pa, PROT_READ | PROT_WRITE); pmap_update(pmap_kernel()); len = MIN(s, PAGE_SIZE - off); explicit_bzero((void *)(rwva + off), len); pmap_kremove(rwva, PAGE_SIZE); km_free((void *)rwva, PAGE_SIZE, &kv_any, &kp_none); va += PAGE_SIZE; s -= len; off = 0; } } static inline void _rs_rekey(u_char *dat, size_t datlen) { #ifndef KEYSTREAM_ONLY memset(rs_buf, 0, sizeof(rs_buf)); #endif /* fill rs_buf with the keystream */ chacha_encrypt_bytes(&rs, rs_buf, rs_buf, sizeof(rs_buf)); /* mix in optional user provided data */ if (dat) { size_t i, m; m = MIN(datlen, KEYSZ + IVSZ); for (i = 0; i < m; i++) rs_buf[i] ^= dat[i]; } /* immediately reinit for backtracking resistance */ _rs_init(rs_buf, KEYSZ + IVSZ); memset(rs_buf, 0, KEYSZ + IVSZ); rs_have = sizeof(rs_buf) - KEYSZ - IVSZ; } static inline void _rs_random_buf(void *_buf, size_t n) { u_char *buf = (u_char *)_buf; size_t m; _rs_stir_if_needed(n); while (n > 0) { if (rs_have > 0) { m = MIN(n, rs_have); memcpy(buf, rs_buf + sizeof(rs_buf) - rs_have, m); memset(rs_buf + sizeof(rs_buf) - rs_have, 0, m); buf += m; n -= m; rs_have -= m; } if (rs_have == 0) _rs_rekey(NULL, 0); } } static inline void _rs_random_u32(u_int32_t *val) { _rs_stir_if_needed(sizeof(*val)); if (rs_have < sizeof(*val)) _rs_rekey(NULL, 0); memcpy(val, rs_buf + sizeof(rs_buf) - rs_have, sizeof(*val)); memset(rs_buf + sizeof(rs_buf) - rs_have, 0, sizeof(*val)); rs_have -= sizeof(*val); } /* Return one word of randomness from a ChaCha20 generator */ u_int32_t arc4random(void) { u_int32_t ret; mtx_enter(&rndlock); _rs_random_u32(&ret); mtx_leave(&rndlock); return ret; } /* * Fill a buffer of arbitrary length with ChaCha20-derived randomness. */ void arc4random_buf(void *buf, size_t n) { mtx_enter(&rndlock); _rs_random_buf(buf, n); mtx_leave(&rndlock); } /* * Allocate a new ChaCha20 context for the caller to use. 
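 *
 * Intended usage (added sketch, using only functions defined here;
 * buf and buflen are the caller's):
 *
 *	struct arc4random_ctx *ctx;
 *
 *	ctx = arc4random_ctx_new();
 *	arc4random_ctx_buf(ctx, buf, buflen);
 *	arc4random_ctx_free(ctx);
 *
 * Each context is keyed once from the main pool (see randomread()
 * below for the in-tree consumer) and then runs independently.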
*/ struct arc4random_ctx * arc4random_ctx_new(void) { char keybuf[KEYSZ + IVSZ]; chacha_ctx *ctx = malloc(sizeof(chacha_ctx), M_TEMP, M_WAITOK); arc4random_buf(keybuf, KEYSZ + IVSZ); chacha_keysetup(ctx, keybuf, KEYSZ * 8); chacha_ivsetup(ctx, keybuf + KEYSZ, NULL); explicit_bzero(keybuf, sizeof(keybuf)); return (struct arc4random_ctx *)ctx; } /* * Free a ChaCha20 context created by arc4random_ctx_new() */ void arc4random_ctx_free(struct arc4random_ctx *ctx) { explicit_bzero(ctx, sizeof(chacha_ctx)); free(ctx, M_TEMP, sizeof(chacha_ctx)); } /* * Use a given ChaCha20 context to fill a buffer */ void arc4random_ctx_buf(struct arc4random_ctx *ctx, void *buf, size_t n) { #ifndef KEYSTREAM_ONLY memset(buf, 0, n); #endif chacha_encrypt_bytes((chacha_ctx *)ctx, buf, buf, n); } /* * Calculate a uniformly distributed random number less than upper_bound * avoiding "modulo bias". * * Uniformity is achieved by generating new random numbers until the one * returned is outside the range [0, 2**32 % upper_bound). This * guarantees the selected random number will be inside * [2**32 % upper_bound, 2**32) which maps back to [0, upper_bound) * after reduction modulo upper_bound. */ u_int32_t arc4random_uniform(u_int32_t upper_bound) { u_int32_t r, min; if (upper_bound < 2) return 0; /* 2**32 % x == (2**32 - x) % x */ min = -upper_bound % upper_bound; /* * This could theoretically loop forever but each retry has * p > 0.5 (worst case, usually far better) of selecting a * number inside the range we need, so it should rarely need * to re-roll. */ for (;;) { r = arc4random(); if (r >= min) break; } return r % upper_bound; } /* ARGSUSED */ void rnd_init(void *null) { _rs_stir(1); } /* * Called by timeout to mark arc4 for stirring, */ void rnd_reinit(void *v) { task_add(systq, &rnd_task); /* 10 minutes, per dm@'s suggestion */ timeout_add_sec(&rndreinit_timeout, 10 * 60); } /* * Start periodic services inside the random subsystem, which pull * entropy forward, hash it, and re-seed the random stream as needed. */ void random_start(int goodseed) { extern char etext[]; #if !defined(NO_PROPOLICE) extern long __guard_local; if (__guard_local == 0) printf("warning: no entropy supplied by boot loader\n"); #endif _rs_clearseed(entropy_pool0, sizeof(entropy_pool0)); _rs_clearseed(rs_buf0, sizeof(rs_buf0)); /* Message buffer may contain data from previous boot */ if (msgbufp->msg_magic == MSG_MAGIC) add_entropy_words((u_int32_t *)msgbufp->msg_bufc, msgbufp->msg_bufs / sizeof(u_int32_t)); add_entropy_words((u_int32_t *)etext - 32*1024, 8192/sizeof(u_int32_t)); dequeue_randomness(NULL); rnd_init(NULL); rnd_reinit(NULL); if (goodseed) printf("random: good seed from bootblocks\n"); else { /* XXX kernel should work harder here */ printf("random: boothowto does not indicate good seed\n"); } } int randomopen(dev_t dev, int flag, int mode, struct proc *p) { return 0; } int randomclose(dev_t dev, int flag, int mode, struct proc *p) { return 0; } /* * Maximum number of bytes to serve directly from the main ChaCha * pool. Larger requests are served from a discrete ChaCha instance keyed * from the main pool. 
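 *
 * (Added rationale, hedged: the discrete instance is keyed under the
 * global rndlock once, after which the bulk keystream generation in
 * randomread() below runs without holding rndlock, so long reads do
 * not stall other arc4random(9) consumers.)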
*/ #define RND_MAIN_MAX_BYTES 2048 int randomread(dev_t dev, struct uio *uio, int ioflag) { struct arc4random_ctx *lctx = NULL; size_t total = uio->uio_resid; u_char *buf; int ret = 0; if (uio->uio_resid == 0) return 0; buf = malloc(POOLBYTES, M_TEMP, M_WAITOK); if (total > RND_MAIN_MAX_BYTES) lctx = arc4random_ctx_new(); while (ret == 0 && uio->uio_resid > 0) { size_t n = ulmin(POOLBYTES, uio->uio_resid); if (lctx != NULL) arc4random_ctx_buf(lctx, buf, n); else arc4random_buf(buf, n); ret = uiomove(buf, n, uio); if (ret == 0 && uio->uio_resid > 0) yield(); } if (lctx != NULL) arc4random_ctx_free(lctx); explicit_bzero(buf, POOLBYTES); free(buf, M_TEMP, POOLBYTES); return ret; } int randomwrite(dev_t dev, struct uio *uio, int flags) { int ret = 0, newdata = 0; u_int32_t *buf; if (uio->uio_resid == 0) return 0; buf = malloc(POOLBYTES, M_TEMP, M_WAITOK); while (ret == 0 && uio->uio_resid > 0) { size_t n = ulmin(POOLBYTES, uio->uio_resid); ret = uiomove(buf, n, uio); if (ret != 0) break; while (n % sizeof(u_int32_t)) ((u_int8_t *)buf)[n++] = 0; add_entropy_words(buf, n / 4); if (uio->uio_resid > 0) yield(); newdata = 1; } if (newdata) rnd_init(NULL); explicit_bzero(buf, POOLBYTES); free(buf, M_TEMP, POOLBYTES); return ret; } int randomkqfilter(dev_t dev, struct knote *kn) { switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &randomread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &randomwrite_filtops; break; default: return (EINVAL); } return (0); } void filt_randomdetach(struct knote *kn) { } int filt_randomread(struct knote *kn, long hint) { kn->kn_data = RND_MAIN_MAX_BYTES; return (1); } int filt_randomwrite(struct knote *kn, long hint) { kn->kn_data = POOLBYTES; return (1); } int randomioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) { switch (cmd) { case FIOASYNC: /* No async flag in softc so this is a no-op. */ break; case FIONBIO: /* Handled in the upper FS layer. */ break; default: return ENOTTY; } return 0; } int sys_getentropy(struct proc *p, void *v, register_t *retval) { struct sys_getentropy_args /* { syscallarg(void *) buf; syscallarg(size_t) nbyte; } */ *uap = v; char buf[256]; int error; if (SCARG(uap, nbyte) > sizeof(buf)) return (EIO); arc4random_buf(buf, SCARG(uap, nbyte)); if ((error = copyout(buf, SCARG(uap, buf), SCARG(uap, nbyte))) != 0) return (error); explicit_bzero(buf, sizeof(buf)); retval[0] = 0; return (0); }
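/*
 * Illustrative sketch (not from the original file): the rejection-
 * sampling idea behind arc4random_uniform() above, as stand-alone
 * userland C.  demo_rand32()/demo_uniform() are hypothetical names;
 * getentropy(2) merely stands in for any uniform 32-bit source (the
 * kernel draws from its ChaCha20 stream instead).
 */
#if 0	/* example only; never compiled */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static uint32_t
demo_rand32(void)
{
	uint32_t r;

	if (getentropy(&r, sizeof(r)) == -1)
		exit(1);
	return r;
}

static uint32_t
demo_uniform(uint32_t upper_bound)
{
	uint32_t r, min;

	if (upper_bound < 2)
		return 0;

	/* 2**32 % x == (2**32 - x) % x, evaluated in 32-bit arithmetic */
	min = -upper_bound % upper_bound;

	/*
	 * Reject values in [0, min); the surviving range holds an exact
	 * multiple of upper_bound values, so the final reduction is
	 * unbiased.
	 */
	do {
		r = demo_rand32();
	} while (r < min);

	return r % upper_bound;
}

int
main(void)
{
	/* a fair six-sided die: uniform over 1..6, no modulo bias */
	printf("%u\n", demo_uniform(6) + 1);
	return 0;
}
#endif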
/* $OpenBSD: kern_sensors.c,v 1.39 2019/12/19 17:40:11 mpi Exp $ */ /* * Copyright (c) 2005 David Gwynne <dlg@openbsd.org> * Copyright (c) 2006 Constantine A. Murenin <cnst+openbsd@bugmail.mojo.ru> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/queue.h> #include <sys/device.h> #include <sys/hotplug.h> #include <sys/timeout.h> #include <sys/task.h> #include <sys/rwlock.h> #include <sys/atomic.h> #include <sys/sensors.h> #include "hotplug.h" struct taskq *sensors_taskq; int sensordev_count; SLIST_HEAD(, ksensordev) sensordev_list = SLIST_HEAD_INITIALIZER(sensordev_list); void sensordev_install(struct ksensordev *sensdev) { struct ksensordev *v, *nv; int s; s = splhigh(); if (sensordev_count == 0) { sensdev->num = 0; SLIST_INSERT_HEAD(&sensordev_list, sensdev, list); } else { for (v = SLIST_FIRST(&sensordev_list); (nv = SLIST_NEXT(v, list)) != NULL; v = nv) if (nv->num - v->num > 1) break; sensdev->num = v->num + 1; SLIST_INSERT_AFTER(v, sensdev, list); } sensordev_count++; splx(s); #if NHOTPLUG > 0 hotplug_device_attach(DV_DULL, "sensordev"); #endif } void sensor_attach(struct ksensordev *sensdev, struct ksensor *sens) { struct ksensor *v, *nv; struct ksensors_head *sh; int s, i; s = splhigh(); sh = &sensdev->sensors_list; if (sensdev->sensors_count == 0) { for (i = 0; i < SENSOR_MAX_TYPES; i++) sensdev->maxnumt[i] = 0; sens->numt = 0; SLIST_INSERT_HEAD(sh, sens, list); } else { for (v = SLIST_FIRST(sh); (nv = SLIST_NEXT(v, list)) != NULL; v = nv) if (v->type == sens->type && (v->type != nv->type || (v->type == nv->type && nv->numt - v->numt > 1))) break; /* sensors of the same type go after each other */ if (v->type == sens->type) sens->numt = v->numt + 1; else sens->numt = 0; SLIST_INSERT_AFTER(v, sens, list); } /* we only increment maxnumt[] if the sensor was added * to the last position of sensors of this type */ if (sensdev->maxnumt[sens->type] == sens->numt) sensdev->maxnumt[sens->type]++; sensdev->sensors_count++; splx(s); } void sensordev_deinstall(struct ksensordev *sensdev) { int s; s = splhigh(); sensordev_count--; SLIST_REMOVE(&sensordev_list, sensdev, ksensordev, list); splx(s); #if NHOTPLUG > 0 hotplug_device_detach(DV_DULL, "sensordev"); #endif } void sensor_detach(struct ksensordev *sensdev, struct ksensor *sens) { struct ksensors_head *sh; int s; s = splhigh(); sh = &sensdev->sensors_list; sensdev->sensors_count--; SLIST_REMOVE(sh, sens, ksensor, list); /* we only decrement maxnumt[] if this is the tail * sensor of this type */ if (sens->numt == sensdev->maxnumt[sens->type] - 1) sensdev->maxnumt[sens->type]--; splx(s); } int sensordev_get(int num, struct ksensordev **sensdev) { struct ksensordev *sd; SLIST_FOREACH(sd, &sensordev_list, list) { if (sd->num == num) { *sensdev = sd; return (0); } if (sd->num > num)
return (ENXIO); } return (ENOENT); } int sensor_find(int dev, enum sensor_type type, int numt, struct ksensor **ksensorp) { struct ksensor *s; struct ksensordev *sensdev; struct ksensors_head *sh; int ret; ret = sensordev_get(dev, &sensdev); if (ret) return (ret); sh = &sensdev->sensors_list; SLIST_FOREACH(s, sh, list) if (s->type == type && s->numt == numt) { *ksensorp = s; return (0); } return (ENOENT); } struct sensor_task { void (*func)(void *); void *arg; unsigned int period; struct timeout timeout; struct task task; struct rwlock lock; }; void sensor_task_tick(void *); void sensor_task_work(void *); struct sensor_task * sensor_task_register(void *arg, void (*func)(void *), unsigned int period) { struct sensor_task *st; #ifdef DIAGNOSTIC if (period == 0) panic("sensor_task_register: period is 0"); #endif if (sensors_taskq == NULL && (sensors_taskq = taskq_create("sensors", 1, IPL_HIGH, 0)) == NULL) sensors_taskq = systq; st = malloc(sizeof(*st), M_DEVBUF, M_NOWAIT); if (st == NULL) return (NULL); st->func = func; st->arg = arg; st->period = period; timeout_set(&st->timeout, sensor_task_tick, st); task_set(&st->task, sensor_task_work, st); rw_init(&st->lock, "sensor"); sensor_task_tick(st); return (st); } void sensor_task_unregister(struct sensor_task *st) { /* * we can't reliably timeout_del or task_del because there's a window * between when they come off the lists and the timeout or task code * actually runs the respective handlers for them. mark the sensor_task * as dying by setting period to 0 and let sensor_task_work mop up. */ rw_enter_write(&st->lock); st->period = 0; rw_exit_write(&st->lock); } void sensor_task_tick(void *arg) { struct sensor_task *st = arg; task_add(sensors_taskq, &st->task); } static int sensors_quiesced; static int sensors_running; void sensor_quiesce(void) { sensors_quiesced = 1; while (sensors_running > 0) tsleep_nsec(&sensors_running, PZERO, "sensorpause", INFSLP); } void sensor_restart(void) { sensors_quiesced = 0; } void sensor_task_work(void *xst) { struct sensor_task *st = xst; unsigned int period = 0; atomic_inc_int(&sensors_running); rw_enter_write(&st->lock); period = st->period; if (period > 0 && !sensors_quiesced) st->func(st->arg); rw_exit_write(&st->lock); if (atomic_dec_int_nv(&sensors_running) == 0) { if (sensors_quiesced) wakeup(&sensors_running); } if (period == 0) free(st, M_DEVBUF, sizeof(*st)); else timeout_add_sec(&st->timeout, period); }
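/*
 * Illustrative sketch (not from the original file): how a driver
 * typically consumes the API above -- attach one temperature sensor,
 * publish the device, and refresh it every 5 seconds from the sensors
 * taskq.  The demo_* names and the constant reading are assumptions;
 * the block relies on the includes already present in this file.
 */
#if 0	/* example only; never compiled */
struct demo_softc {
	struct ksensordev	sc_sensordev;
	struct ksensor		sc_sensor;
	struct sensor_task	*sc_sensortask;
};

void
demo_refresh(void *arg)
{
	struct demo_softc *sc = arg;

	/*
	 * A real driver would read the hardware here.  Temperature is
	 * published in microkelvin: 42 degC == 315.15 K.
	 */
	sc->sc_sensor.value = 273150000 + 42 * 1000000;
	sc->sc_sensor.flags &= ~SENSOR_FINVALID;
}

void
demo_attach_sensors(struct demo_softc *sc, const char *xname)
{
	strlcpy(sc->sc_sensordev.xname, xname,
	    sizeof(sc->sc_sensordev.xname));
	sc->sc_sensor.type = SENSOR_TEMP;
	sc->sc_sensor.flags = SENSOR_FINVALID;	/* until first refresh */

	sensor_attach(&sc->sc_sensordev, &sc->sc_sensor);
	sensordev_install(&sc->sc_sensordev);

	/* period is in seconds; see sensor_task_work() above */
	sc->sc_sensortask = sensor_task_register(sc, demo_refresh, 5);
	if (sc->sc_sensortask == NULL)
		printf("%s: could not register sensor task\n", xname);
}
#endif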
/* $OpenBSD: kern_rwlock.c,v 1.47 2021/02/08 08:18:45 mpi Exp $ */ /* * Copyright (c) 2002, 2003 Artur Grabowski <art@openbsd.org> * Copyright (c) 2011 Thordur Bjornsson <thib@secnorth.net> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/pool.h> #include <sys/proc.h> #include <sys/rwlock.h> #include <sys/limits.h> #include <sys/atomic.h> #include <sys/witness.h> void rw_do_exit(struct rwlock *, unsigned long); /* XXX - temporary measure until proc0 is properly aligned */ #define RW_PROC(p) (((long)p) & ~RWLOCK_MASK) /* * Other OSes implement more sophisticated mechanisms to determine how long * the process attempting to acquire the lock should spin. We start with * the simplest approach: we make at most RW_SPINS attempts before eventually * giving up and putting the process on the sleep queue. */ #define RW_SPINS 1000 #ifdef MULTIPROCESSOR #define rw_cas(p, o, n) (atomic_cas_ulong(p, o, n) != o) #else static inline int rw_cas(volatile unsigned long *p, unsigned long o, unsigned long n) { if (*p != o) return (1); *p = n; return (0); } #endif /* * Magic wand for lock operations. Every operation checks if certain * flags are set and if they aren't, it increments the lock with some * value (that might need some computing in a few cases). If the operation * fails, we need to set certain flags while waiting for the lock. * * RW_WRITE The lock must be completely empty. We increment it with * RWLOCK_WRLOCK and the proc pointer of the holder. * Sets RWLOCK_WAIT|RWLOCK_WRWANT while waiting. * RW_READ RWLOCK_WRLOCK|RWLOCK_WRWANT may not be set. We increment * with RWLOCK_READ_INCR. RWLOCK_WAIT while waiting. */ static const struct rwlock_op { unsigned long inc; unsigned long check; unsigned long wait_set; long proc_mult; int wait_prio; } rw_ops[] = { { /* RW_WRITE */ RWLOCK_WRLOCK, ULONG_MAX, RWLOCK_WAIT | RWLOCK_WRWANT, 1, PLOCK - 4 }, { /* RW_READ */ RWLOCK_READ_INCR, RWLOCK_WRLOCK, RWLOCK_WAIT, 0, PLOCK }, { /* Sparse Entry.
*/ 0, }, { /* RW_DOWNGRADE */ RWLOCK_READ_INCR - RWLOCK_WRLOCK, 0, 0, -1, PLOCK }, }; void rw_enter_read(struct rwlock *rwl) { unsigned long owner = rwl->rwl_owner; if (__predict_false((owner & RWLOCK_WRLOCK) || rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR))) rw_enter(rwl, RW_READ); else { membar_enter_after_atomic(); WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, NULL); WITNESS_LOCK(&rwl->rwl_lock_obj, 0); } } void rw_enter_write(struct rwlock *rwl) { struct proc *p = curproc; if (__predict_false(rw_cas(&rwl->rwl_owner, 0, RW_PROC(p) | RWLOCK_WRLOCK))) rw_enter(rwl, RW_WRITE); else { membar_enter_after_atomic(); WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_EXCLUSIVE | LOP_NEWORDER, NULL); WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE); } } void rw_exit_read(struct rwlock *rwl) { unsigned long owner; rw_assert_rdlock(rwl); WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0); membar_exit_before_atomic(); owner = rwl->rwl_owner; if (__predict_false((owner & RWLOCK_WAIT) || rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR))) rw_do_exit(rwl, 0); } void rw_exit_write(struct rwlock *rwl) { unsigned long owner; rw_assert_wrlock(rwl); WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE); membar_exit_before_atomic(); owner = rwl->rwl_owner; if (__predict_false((owner & RWLOCK_WAIT) || rw_cas(&rwl->rwl_owner, owner, 0))) rw_do_exit(rwl, RWLOCK_WRLOCK); } #ifdef DIAGNOSTIC /* * Put the diagnostic functions here to keep the main code free * from ifdef clutter. */ static void rw_enter_diag(struct rwlock *rwl, int flags) { switch (flags & RW_OPMASK) { case RW_WRITE: case RW_READ: if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner)) panic("rw_enter: %s locking against myself", rwl->rwl_name); break; case RW_DOWNGRADE: /* * If we're downgrading, we must hold the write lock. */ if ((rwl->rwl_owner & RWLOCK_WRLOCK) == 0) panic("rw_enter: %s downgrade of non-write lock", rwl->rwl_name); if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner)) panic("rw_enter: %s downgrade, not holder", rwl->rwl_name); break; default: panic("rw_enter: unknown op 0x%x", flags); } } #else #define rw_enter_diag(r, f) #endif static void _rw_init_flags_witness(struct rwlock *rwl, const char *name, int lo_flags, const struct lock_type *type) { rwl->rwl_owner = 0; rwl->rwl_name = name; #ifdef WITNESS rwl->rwl_lock_obj.lo_flags = lo_flags; rwl->rwl_lock_obj.lo_name = name; rwl->rwl_lock_obj.lo_type = type; WITNESS_INIT(&rwl->rwl_lock_obj, type); #else (void)type; (void)lo_flags; #endif } void _rw_init_flags(struct rwlock *rwl, const char *name, int flags, const struct lock_type *type) { _rw_init_flags_witness(rwl, name, RWLOCK_LO_FLAGS(flags), type); } int rw_enter(struct rwlock *rwl, int flags) { const struct rwlock_op *op; struct sleep_state sls; unsigned long inc, o; #ifdef MULTIPROCESSOR /* * If process holds the kernel lock, then we want to give up on CPU * as soon as possible so other processes waiting for the kernel lock * can progress. Hence no spinning if we hold the kernel lock. */ unsigned int spin = (_kernel_lock_held()) ? 
0 : RW_SPINS; #endif int error, prio; #ifdef WITNESS int lop_flags; lop_flags = LOP_NEWORDER; if (flags & RW_WRITE) lop_flags |= LOP_EXCLUSIVE; if (flags & RW_DUPOK) lop_flags |= LOP_DUPOK; if ((flags & RW_NOSLEEP) == 0 && (flags & RW_DOWNGRADE) == 0) WITNESS_CHECKORDER(&rwl->rwl_lock_obj, lop_flags, NULL); #endif op = &rw_ops[(flags & RW_OPMASK) - 1]; inc = op->inc + RW_PROC(curproc) * op->proc_mult; retry: while (__predict_false(((o = rwl->rwl_owner) & op->check) != 0)) { unsigned long set = o | op->wait_set; int do_sleep; /* Avoid deadlocks after panic or in DDB */ if (panicstr || db_active) return (0); #ifdef MULTIPROCESSOR /* * It makes sense to try to spin just in case the lock * is acquired by writer. */ if ((o & RWLOCK_WRLOCK) && (spin != 0)) { spin--; CPU_BUSY_CYCLE(); continue; } #endif rw_enter_diag(rwl, flags); if (flags & RW_NOSLEEP) return (EBUSY); prio = op->wait_prio; if (flags & RW_INTR) prio |= PCATCH; sleep_setup(&sls, rwl, prio, rwl->rwl_name, 0); do_sleep = !rw_cas(&rwl->rwl_owner, o, set); error = sleep_finish(&sls, do_sleep); if ((flags & RW_INTR) && (error != 0)) return (error); if (flags & RW_SLEEPFAIL) return (EAGAIN); } if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc))) goto retry; membar_enter_after_atomic(); /* * If old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we * downgraded a write lock and had possible read waiter, wake them * to let them retry the lock. */ if (__predict_false((o & (RWLOCK_WRLOCK|RWLOCK_WAIT)) == (RWLOCK_WRLOCK|RWLOCK_WAIT))) wakeup(rwl); if (flags & RW_DOWNGRADE) WITNESS_DOWNGRADE(&rwl->rwl_lock_obj, lop_flags); else WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags); return (0); } void rw_exit(struct rwlock *rwl) { unsigned long wrlock; /* Avoid deadlocks after panic or in DDB */ if (panicstr || db_active) return; wrlock = rwl->rwl_owner & RWLOCK_WRLOCK; if (wrlock) rw_assert_wrlock(rwl); else rw_assert_rdlock(rwl); WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0); membar_exit_before_atomic(); rw_do_exit(rwl, wrlock); } /* membar_exit_before_atomic() has to precede call of this function. 
*/ void rw_do_exit(struct rwlock *rwl, unsigned long wrlock) { unsigned long owner, set; do { owner = rwl->rwl_owner; if (wrlock) set = 0; else set = (owner - RWLOCK_READ_INCR) & ~(RWLOCK_WAIT|RWLOCK_WRWANT); } while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set))); if (owner & RWLOCK_WAIT) wakeup(rwl); } int rw_status(struct rwlock *rwl) { unsigned long owner = rwl->rwl_owner; if (owner & RWLOCK_WRLOCK) { if (RW_PROC(curproc) == RW_PROC(owner)) return RW_WRITE; else return RW_WRITE_OTHER; } if (owner) return RW_READ; return (0); } #ifdef DIAGNOSTIC void rw_assert_wrlock(struct rwlock *rwl) { if (panicstr || db_active) return; #ifdef WITNESS witness_assert(&rwl->rwl_lock_obj, LA_XLOCKED); #else if (!(rwl->rwl_owner & RWLOCK_WRLOCK)) panic("%s: lock not held", rwl->rwl_name); if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner)) panic("%s: lock not held by this process", rwl->rwl_name); #endif } void rw_assert_rdlock(struct rwlock *rwl) { if (panicstr || db_active) return; #ifdef WITNESS witness_assert(&rwl->rwl_lock_obj, LA_SLOCKED); #else if (!RW_PROC(rwl->rwl_owner) || (rwl->rwl_owner & RWLOCK_WRLOCK)) panic("%s: lock not shared", rwl->rwl_name); #endif } void rw_assert_anylock(struct rwlock *rwl) { if (panicstr || db_active) return; #ifdef WITNESS witness_assert(&rwl->rwl_lock_obj, LA_LOCKED); #else switch (rw_status(rwl)) { case RW_WRITE_OTHER: panic("%s: lock held by different process", rwl->rwl_name); case 0: panic("%s: lock not held", rwl->rwl_name); } #endif } void rw_assert_unlocked(struct rwlock *rwl) { if (panicstr || db_active) return; #ifdef WITNESS witness_assert(&rwl->rwl_lock_obj, LA_UNLOCKED); #else if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner)) panic("%s: lock held", rwl->rwl_name); #endif } #endif /* recursive rwlocks; */ void _rrw_init_flags(struct rrwlock *rrwl, const char *name, int flags, const struct lock_type *type) { memset(rrwl, 0, sizeof(struct rrwlock)); _rw_init_flags_witness(&rrwl->rrwl_lock, name, RRWLOCK_LO_FLAGS(flags), type); } int rrw_enter(struct rrwlock *rrwl, int flags) { int rv; if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) { if (flags & RW_RECURSEFAIL) return (EDEADLK); else { rrwl->rrwl_wcnt++; WITNESS_LOCK(&rrwl->rrwl_lock.rwl_lock_obj, LOP_EXCLUSIVE); return (0); } } rv = rw_enter(&rrwl->rrwl_lock, flags); if (rv == 0) rrwl->rrwl_wcnt = 1; return (rv); } void rrw_exit(struct rrwlock *rrwl) { if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) { KASSERT(rrwl->rrwl_wcnt > 0); rrwl->rrwl_wcnt--; if (rrwl->rrwl_wcnt != 0) { WITNESS_UNLOCK(&rrwl->rrwl_lock.rwl_lock_obj, LOP_EXCLUSIVE); return; } } rw_exit(&rrwl->rrwl_lock); } int rrw_status(struct rrwlock *rrwl) { return (rw_status(&rrwl->rrwl_lock)); } /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #define RWLOCK_OBJ_MAGIC 0x5aa3c85d struct rwlock_obj { struct rwlock ro_lock; u_int ro_magic; u_int ro_refcnt; }; struct pool rwlock_obj_pool; /* * rw_obj_init: * * Initialize the mutex object store. */ void rw_obj_init(void) { pool_init(&rwlock_obj_pool, sizeof(struct rwlock_obj), 0, IPL_MPFLOOR, PR_WAITOK, "rwobjpl", NULL); } /* * rw_obj_alloc: * * Allocate a single lock object. */ void _rw_obj_alloc_flags(struct rwlock **lock, const char *name, int flags, struct lock_type *type) { struct rwlock_obj *mo; mo = pool_get(&rwlock_obj_pool, PR_WAITOK); mo->ro_magic = RWLOCK_OBJ_MAGIC; _rw_init_flags(&mo->ro_lock, name, flags, type); mo->ro_refcnt = 1; *lock = &mo->ro_lock; } /* * rw_obj_hold: * * Add a single reference to a lock object. A reference to the object * must already be held, and must be held across this call. */ void rw_obj_hold(struct rwlock *lock) { struct rwlock_obj *mo = (struct rwlock_obj *)lock; KASSERTMSG(mo->ro_magic == RWLOCK_OBJ_MAGIC, "%s: lock %p: mo->ro_magic (%#x) != RWLOCK_OBJ_MAGIC (%#x)", __func__, mo, mo->ro_magic, RWLOCK_OBJ_MAGIC); KASSERTMSG(mo->ro_refcnt > 0, "%s: lock %p: mo->ro_refcnt (%#x) == 0", __func__, mo, mo->ro_refcnt); atomic_inc_int(&mo->ro_refcnt); } /* * rw_obj_free: * * Drop a reference from a lock object. If the last reference is being * dropped, free the object and return true. Otherwise, return false. */ int rw_obj_free(struct rwlock *lock) { struct rwlock_obj *mo = (struct rwlock_obj *)lock; KASSERTMSG(mo->ro_magic == RWLOCK_OBJ_MAGIC, "%s: lock %p: mo->ro_magic (%#x) != RWLOCK_OBJ_MAGIC (%#x)", __func__, mo, mo->ro_magic, RWLOCK_OBJ_MAGIC); KASSERTMSG(mo->ro_refcnt > 0, "%s: lock %p: mo->ro_refcnt (%#x) == 0", __func__, mo, mo->ro_refcnt); if (atomic_dec_int_nv(&mo->ro_refcnt) > 0) { return false; } #if notyet WITNESS_DESTROY(&mo->ro_lock); #endif pool_put(&rwlock_obj_pool, mo); return true; }
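/*
 * Illustrative sketch (not from the original file): typical use of the
 * interfaces above -- a read/write lock around a shared counter, plus
 * the RW_NOSLEEP/RW_INTR variants of rw_enter().  demo_* names are
 * hypothetical; RWLOCK_INITIALIZER() is assumed from <sys/rwlock.h>.
 */
#if 0	/* example only; never compiled */
struct rwlock demo_lock = RWLOCK_INITIALIZER("demolk");
int demo_count;

int
demo_read(void)
{
	int v;

	rw_enter_read(&demo_lock);
	v = demo_count;
	rw_exit_read(&demo_lock);
	return v;
}

int
demo_increment(int canwait)
{
	int error;

	/* opportunistic attempt that never sleeps */
	if (rw_enter(&demo_lock, RW_WRITE | RW_NOSLEEP) != 0) {
		if (!canwait)
			return EBUSY;
		/* RW_INTR adds PCATCH, so a signal can interrupt the sleep */
		error = rw_enter(&demo_lock, RW_WRITE | RW_INTR);
		if (error)
			return error;
	}
	demo_count++;
	rw_exit(&demo_lock);
	return 0;
}
#endif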
/* $OpenBSD: ufs_dirhash.c,v 1.42 2019/03/15 05:42:38 kevlo Exp $ */ /* * Copyright (c) 2001, 2002 Ian Dowse. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This implements a hash-based lookup scheme for UFS directories. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/pool.h> #include <sys/buf.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/sysctl.h> #include <sys/mutex.h> #include <crypto/siphash.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/dirhash.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> #define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1)) #define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1)) #define OFSFMT(ip) ((ip)->i_ump->um_maxsymlinklen == 0) #define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n)) int ufs_mindirhashsize; int ufs_dirhashmaxmem; int ufs_dirhashmem; int ufs_dirhashcheck; SIPHASH_KEY ufsdirhash_key; int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen); void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff); void ufsdirhash_delslot(struct dirhash *dh, int slot); int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset); doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset); int ufsdirhash_recycle(int wanted); struct pool ufsdirhash_pool; #define DIRHASHLIST_LOCK() rw_enter_write(&ufsdirhash_mtx) #define DIRHASHLIST_UNLOCK() rw_exit_write(&ufsdirhash_mtx) #define DIRHASH_LOCK(dh) rw_enter_write(&(dh)->dh_mtx) #define DIRHASH_UNLOCK(dh) rw_exit_write(&(dh)->dh_mtx) #define DIRHASH_BLKALLOC_WAITOK() pool_get(&ufsdirhash_pool, PR_WAITOK) #define DIRHASH_BLKFREE(v) pool_put(&ufsdirhash_pool, v) #define mtx_assert(l, f) /* nothing */ #define DIRHASH_ASSERT(e, m) KASSERT((e)) /* Dirhash list; recently-used entries are near the tail. */ TAILQ_HEAD(, dirhash) ufsdirhash_list; /* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */ struct rwlock ufsdirhash_mtx; /* * Locking order: * ufsdirhash_mtx * dh_mtx * * The dh_mtx mutex should be acquired either via the inode lock, or via * ufsdirhash_mtx.
Only the owner of the inode may free the associated * dirhash, but anything can steal its memory and set dh_hash to NULL. */ /* * Attempt to build up a hash table for the directory contents in * inode 'ip'. Returns 0 on success, or -1 if the operation failed. */ int ufsdirhash_build(struct inode *ip) { struct dirhash *dh; struct buf *bp = NULL; struct direct *ep; struct vnode *vp; doff_t bmask, pos; int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot; /* Check if we can/should use dirhash. */ if (ip->i_dirhash == NULL) { if (DIP(ip, size) < ufs_mindirhashsize || OFSFMT(ip)) return (-1); } else { /* Hash exists, but sysctls could have changed. */ if (DIP(ip, size) < ufs_mindirhashsize || ufs_dirhashmem > ufs_dirhashmaxmem) { ufsdirhash_free(ip); return (-1); } /* Check if hash exists and is intact (note: unlocked read). */ if (ip->i_dirhash->dh_hash != NULL) return (0); /* Free the old, recycled hash and build a new one. */ ufsdirhash_free(ip); } /* Don't hash removed directories. */ if (ip->i_effnlink == 0) return (-1); vp = ip->i_vnode; /* Allocate 50% more entries than this dir size could ever need. */ DIRHASH_ASSERT(DIP(ip, size) >= DIRBLKSIZ, ("ufsdirhash_build size")); nslots = DIP(ip, size) / DIRECTSIZ(1); nslots = (nslots * 3 + 1) / 2; narrays = howmany(nslots, DH_NBLKOFF); nslots = narrays * DH_NBLKOFF; dirblocks = howmany(DIP(ip, size), DIRBLKSIZ); nblocks = (dirblocks * 3 + 1) / 2; memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) + narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + nblocks * sizeof(*dh->dh_blkfree); DIRHASHLIST_LOCK(); if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) { DIRHASHLIST_UNLOCK(); if (memreqd > ufs_dirhashmaxmem / 2) return (-1); /* Try to free some space. */ if (ufsdirhash_recycle(memreqd) != 0) return (-1); /* Enough was freed, and list has been locked. */ } ufs_dirhashmem += memreqd; DIRHASHLIST_UNLOCK(); /* * Use non-blocking mallocs so that we will revert to a linear * lookup on failure rather than potentially blocking forever. */ dh = malloc(sizeof(*dh), M_DIRHASH, M_NOWAIT|M_ZERO); if (dh == NULL) { DIRHASHLIST_LOCK(); ufs_dirhashmem -= memreqd; DIRHASHLIST_UNLOCK(); return (-1); } dh->dh_hash = mallocarray(narrays, sizeof(dh->dh_hash[0]), M_DIRHASH, M_NOWAIT|M_ZERO); dh->dh_blkfree = mallocarray(nblocks, sizeof(dh->dh_blkfree[0]), M_DIRHASH, M_NOWAIT | M_ZERO); if (dh->dh_hash == NULL || dh->dh_blkfree == NULL) goto fail; for (i = 0; i < narrays; i++) { if ((dh->dh_hash[i] = DIRHASH_BLKALLOC_WAITOK()) == NULL) goto fail; for (j = 0; j < DH_NBLKOFF; j++) dh->dh_hash[i][j] = DIRHASH_EMPTY; } /* Initialise the hash table and block statistics. */ rw_init(&dh->dh_mtx, "dirhash"); dh->dh_narrays = narrays; dh->dh_hlen = nslots; dh->dh_nblk = nblocks; dh->dh_dirblks = dirblocks; for (i = 0; i < dirblocks; i++) dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN; for (i = 0; i < DH_NFSTATS; i++) dh->dh_firstfree[i] = -1; dh->dh_firstfree[DH_NFSTATS] = 0; dh->dh_seqopt = 0; dh->dh_seqoff = 0; dh->dh_score = DH_SCOREINIT; ip->i_dirhash = dh; bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; pos = 0; while (pos < DIP(ip, size)) { /* If necessary, get the next directory block. */ if ((pos & bmask) == 0) { if (bp != NULL) brelse(bp); if (UFS_BUFATOFF(ip, (off_t)pos, NULL, &bp) != 0) goto fail; } /* Add this entry to the hash. */ ep = (struct direct *)((char *)bp->b_data + (pos & bmask)); if (ep->d_reclen == 0 || ep->d_reclen > DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) { /* Corrupted directory.
*/ brelse(bp); goto fail; } if (ep->d_ino != 0) { /* Add the entry (simplified ufsdirhash_add). */ slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen); while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY) slot = WRAPINCR(slot, dh->dh_hlen); dh->dh_hused++; DH_ENTRY(dh, slot) = pos; ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep)); } pos += ep->d_reclen; } if (bp != NULL) brelse(bp); DIRHASHLIST_LOCK(); TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list); dh->dh_onlist = 1; DIRHASHLIST_UNLOCK(); return (0); fail: if (dh->dh_hash != NULL) { for (i = 0; i < narrays; i++) if (dh->dh_hash[i] != NULL) DIRHASH_BLKFREE(dh->dh_hash[i]); free(dh->dh_hash, M_DIRHASH, narrays * sizeof(dh->dh_hash[0])); } if (dh->dh_blkfree != NULL) free(dh->dh_blkfree, M_DIRHASH, nblocks * sizeof(dh->dh_blkfree[0])); free(dh, M_DIRHASH, sizeof(*dh)); ip->i_dirhash = NULL; DIRHASHLIST_LOCK(); ufs_dirhashmem -= memreqd; DIRHASHLIST_UNLOCK(); return (-1); } /* * Free any hash table associated with inode 'ip'. */ void ufsdirhash_free(struct inode *ip) { struct dirhash *dh; int i, mem; if ((dh = ip->i_dirhash) == NULL) return; DIRHASHLIST_LOCK(); DIRHASH_LOCK(dh); if (dh->dh_onlist) TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); DIRHASH_UNLOCK(dh); DIRHASHLIST_UNLOCK(); /* The dirhash pointed to by 'dh' is exclusively ours now. */ mem = sizeof(*dh); if (dh->dh_hash != NULL) { for (i = 0; i < dh->dh_narrays; i++) DIRHASH_BLKFREE(dh->dh_hash[i]); free(dh->dh_hash, M_DIRHASH, dh->dh_narrays * sizeof(dh->dh_hash[0])); free(dh->dh_blkfree, M_DIRHASH, dh->dh_nblk * sizeof(dh->dh_blkfree[0])); mem += dh->dh_narrays * sizeof(*dh->dh_hash) + dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + dh->dh_nblk * sizeof(*dh->dh_blkfree); } free(dh, M_DIRHASH, sizeof(*dh)); ip->i_dirhash = NULL; DIRHASHLIST_LOCK(); ufs_dirhashmem -= mem; DIRHASHLIST_UNLOCK(); } /* * Find the offset of the specified name within the given inode. * Returns 0 on success, ENOENT if the entry does not exist, or * EJUSTRETURN if the caller should revert to a linear search. * * If successful, the directory offset is stored in *offp, and a * pointer to a struct buf containing the entry is stored in *bpp. If * prevoffp is non-NULL, the offset of the previous entry within * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry * is the first in a block, the start of the block is used). */ int ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp, struct buf **bpp, doff_t *prevoffp) { struct dirhash *dh, *dh_next; struct direct *dp; struct vnode *vp; struct buf *bp; doff_t blkoff, bmask, offset, prevoff; int i, slot; if ((dh = ip->i_dirhash) == NULL) return (EJUSTRETURN); /* * Move this dirhash towards the end of the list if it has a * score higher than the next entry, and acquire the dh_mtx. * Optimise the case where it's already the last by performing * an unlocked read of the TAILQ_NEXT pointer. * * In both cases, end up holding just dh_mtx. */ if (TAILQ_NEXT(dh, dh_list) != NULL) { DIRHASHLIST_LOCK(); DIRHASH_LOCK(dh); /* * If the new score will be greater than that of the next * entry, then move this entry past it. With both mutexes * held, dh_next won't go away, but its dh_score could * change; that's not important since it is just a hint. 
*/ if (dh->dh_hash != NULL && (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL && dh->dh_score >= dh_next->dh_score) { DIRHASH_ASSERT(dh->dh_onlist, ("dirhash: not on list")); TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh, dh_list); } DIRHASHLIST_UNLOCK(); } else { /* Already the last, though that could change as we wait. */ DIRHASH_LOCK(dh); } if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return (EJUSTRETURN); } /* Update the score. */ if (dh->dh_score < DH_SCOREMAX) dh->dh_score++; vp = ip->i_vnode; bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; blkoff = -1; bp = NULL; restart: slot = ufsdirhash_hash(dh, name, namelen); if (dh->dh_seqopt) { /* * Sequential access optimisation. dh_seqoff contains the * offset of the directory entry immediately following * the last entry that was looked up. Check if this offset * appears in the hash chain for the name we are looking for. */ for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY; i = WRAPINCR(i, dh->dh_hlen)) if (offset == dh->dh_seqoff) break; if (offset == dh->dh_seqoff) { /* * We found an entry with the expected offset. This * is probably the entry we want, but if not, the * code below will turn off seqopt and retry. */ slot = i; } else dh->dh_seqopt = 0; } for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY; slot = WRAPINCR(slot, dh->dh_hlen)) { if (offset == DIRHASH_DEL) continue; DIRHASH_UNLOCK(dh); if (offset < 0 || offset >= DIP(ip, size)) panic("ufsdirhash_lookup: bad offset in hash array"); if ((offset & ~bmask) != blkoff) { if (bp != NULL) brelse(bp); blkoff = offset & ~bmask; if (UFS_BUFATOFF(ip, (off_t)blkoff, NULL, &bp) != 0) return (EJUSTRETURN); } dp = (struct direct *)(bp->b_data + (offset & bmask)); if (dp->d_reclen == 0 || dp->d_reclen > DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) { /* Corrupted directory. */ brelse(bp); return (EJUSTRETURN); } if (dp->d_namlen == namelen && memcmp(dp->d_name, name, namelen) == 0) { /* Found. Get the prev offset if needed. */ if (prevoffp != NULL) { if (offset & (DIRBLKSIZ - 1)) { prevoff = ufsdirhash_getprev(dp, offset); if (prevoff == -1) { brelse(bp); return (EJUSTRETURN); } } else prevoff = offset; *prevoffp = prevoff; } /* Check for sequential access, and update offset. */ if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset) dh->dh_seqopt = 1; dh->dh_seqoff = offset + DIRSIZ(0, dp); *bpp = bp; *offp = offset; return (0); } DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); if (bp != NULL) brelse(bp); ufsdirhash_free(ip); return (EJUSTRETURN); } /* * When the name doesn't match in the seqopt case, go back * and search normally. */ if (dh->dh_seqopt) { dh->dh_seqopt = 0; goto restart; } } DIRHASH_UNLOCK(dh); if (bp != NULL) brelse(bp); return (ENOENT); } /* * Find a directory block with room for 'slotneeded' bytes. Returns * the offset of the directory entry that begins the free space. * This will either be the offset of an existing entry that has free * space at the end, or the offset of an entry with d_ino == 0 at * the start of a DIRBLKSIZ block. * * To use the space, the caller may need to compact existing entries in * the directory. The total number of bytes in all of the entries involved * in the compaction is stored in *slotsize. In other words, all of * the entries that must be compacted are exactly contained in the * region beginning at the returned offset and spanning *slotsize bytes. * * Returns -1 if no space was found, indicating that the directory * must be extended. 
*/ doff_t ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize) { struct direct *dp; struct dirhash *dh; struct buf *bp; doff_t pos, slotstart; int dirblock, error, freebytes, i; if ((dh = ip->i_dirhash) == NULL) return (-1); DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return (-1); } /* Find a directory block with the desired free space. */ dirblock = -1; for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++) if ((dirblock = dh->dh_firstfree[i]) != -1) break; if (dirblock == -1) { DIRHASH_UNLOCK(dh); return (-1); } DIRHASH_ASSERT(dirblock < dh->dh_nblk && dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN), ("ufsdirhash_findfree: bad stats")); DIRHASH_UNLOCK(dh); pos = dirblock * DIRBLKSIZ; error = UFS_BUFATOFF(ip, (off_t)pos, (char **)&dp, &bp); if (error) return (-1); /* Find the first entry with free space. */ for (i = 0; i < DIRBLKSIZ; ) { if (dp->d_reclen == 0) { brelse(bp); return (-1); } if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp)) break; i += dp->d_reclen; dp = (struct direct *)((char *)dp + dp->d_reclen); } if (i > DIRBLKSIZ) { brelse(bp); return (-1); } slotstart = pos + i; /* Find the range of entries needed to get enough space */ freebytes = 0; while (i < DIRBLKSIZ && freebytes < slotneeded) { freebytes += dp->d_reclen; if (dp->d_ino != 0) freebytes -= DIRSIZ(0, dp); if (dp->d_reclen == 0) { brelse(bp); return (-1); } i += dp->d_reclen; dp = (struct direct *)((char *)dp + dp->d_reclen); } if (i > DIRBLKSIZ) { brelse(bp); return (-1); } if (freebytes < slotneeded) panic("ufsdirhash_findfree: free mismatch"); brelse(bp); *slotsize = pos + i - slotstart; return (slotstart); } /* * Return the start of the unused space at the end of a directory, or * -1 if there are no trailing unused blocks. */ doff_t ufsdirhash_enduseful(struct inode *ip) { struct dirhash *dh; int i; if ((dh = ip->i_dirhash) == NULL) return (-1); DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return (-1); } if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) { DIRHASH_UNLOCK(dh); return (-1); } for (i = dh->dh_dirblks - 1; i >= 0; i--) if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN) break; DIRHASH_UNLOCK(dh); return ((doff_t)(i + 1) * DIRBLKSIZ); } /* * Insert information into the hash about a new directory entry. dirp * points to a struct direct containing the entry, and offset specifies * the offset of this entry. */ void ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset) { struct dirhash *dh; int slot; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } DIRHASH_ASSERT(offset < dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_add: bad offset")); /* * Normal hash usage is < 66%. If the usage gets too high then * remove the hash entirely and let it be rebuilt later. */ if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } /* Find a free hash slot (empty or deleted), and add the entry. */ slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen); while (DH_ENTRY(dh, slot) >= 0) slot = WRAPINCR(slot, dh->dh_hlen); if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY) dh->dh_hused++; DH_ENTRY(dh, slot) = offset; /* Update the per-block summary info. */ ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp)); DIRHASH_UNLOCK(dh); } /* * Remove the specified directory entry from the hash. 
The entry to remove * is defined by the name in `dirp', which must exist at the specified * `offset' within the directory. */ void ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset) { struct dirhash *dh; int slot; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } DIRHASH_ASSERT(offset < dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_remove: bad offset")); /* Find the entry */ slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset); /* Remove the hash entry. */ ufsdirhash_delslot(dh, slot); /* Update the per-block summary info. */ ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp)); DIRHASH_UNLOCK(dh); } /* * Change the offset associated with a directory entry in the hash. Used * when compacting directory blocks. */ void ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff, doff_t newoff) { struct dirhash *dh; int slot; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } DIRHASH_ASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ && newoff < dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_move: bad offset")); /* Find the entry, and update the offset. */ slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff); DH_ENTRY(dh, slot) = newoff; DIRHASH_UNLOCK(dh); } /* * Inform dirhash that the directory has grown by one block that * begins at offset (i.e. the new length is offset + DIRBLKSIZ). */ void ufsdirhash_newblk(struct inode *ip, doff_t offset) { struct dirhash *dh; int block; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } DIRHASH_ASSERT(offset == dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_newblk: bad offset")); block = offset / DIRBLKSIZ; if (block >= dh->dh_nblk) { /* Out of space; must rebuild. */ DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } dh->dh_dirblks = block + 1; /* Account for the new free block. */ dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN; if (dh->dh_firstfree[DH_NFSTATS] == -1) dh->dh_firstfree[DH_NFSTATS] = block; DIRHASH_UNLOCK(dh); } /* * Inform dirhash that the directory is being truncated. */ void ufsdirhash_dirtrunc(struct inode *ip, doff_t offset) { struct dirhash *dh; int block, i; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } DIRHASH_ASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_dirtrunc: bad offset")); block = howmany(offset, DIRBLKSIZ); /* * If the directory shrinks to less than 1/8 of dh_nblk blocks * (about 20% of its original size due to the 50% extra added in * ufsdirhash_build) then free it, and let the caller rebuild * if necessary. */ if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } /* * Remove any `first free' information pertaining to the * truncated blocks. All blocks we're removing should be * completely unused. */ if (dh->dh_firstfree[DH_NFSTATS] >= block) dh->dh_firstfree[DH_NFSTATS] = -1; for (i = block; i < dh->dh_dirblks; i++) if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN) panic("ufsdirhash_dirtrunc: blocks in use"); for (i = 0; i < DH_NFSTATS; i++) if (dh->dh_firstfree[i] >= block) panic("ufsdirhash_dirtrunc: first free corrupt"); dh->dh_dirblks = block; DIRHASH_UNLOCK(dh); } /* * Debugging function to check that the dirhash information about * a directory block matches its actual contents. 
Panics if a mismatch * is detected. * * On entry, `buf' should point to the start of an in-core * DIRBLKSIZ-sized directory block, and `offset' should contain the * offset from the start of the directory of that block. */ void ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset) { struct dirhash *dh; struct direct *dp; int block, ffslot, i, nfree; if (!ufs_dirhashcheck) return; if ((dh = ip->i_dirhash) == NULL) return; DIRHASH_LOCK(dh); if (dh->dh_hash == NULL) { DIRHASH_UNLOCK(dh); ufsdirhash_free(ip); return; } block = offset / DIRBLKSIZ; if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks) panic("ufsdirhash_checkblock: bad offset"); nfree = 0; for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) { dp = (struct direct *)(buf + i); if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ) panic("ufsdirhash_checkblock: bad dir"); if (dp->d_ino == 0) { #if 0 /* * XXX entries with d_ino == 0 should only occur * at the start of a DIRBLKSIZ block. However the * ufs code is tolerant of such entries at other * offsets, and fsck does not fix them. */ if (i != 0) panic("ufsdirhash_checkblock: bad dir inode"); #endif nfree += dp->d_reclen; continue; } /* Check that the entry exists (will panic if it doesn't). */ ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i); nfree += dp->d_reclen - DIRSIZ(0, dp); } if (i != DIRBLKSIZ) panic("ufsdirhash_checkblock: bad dir end"); if (dh->dh_blkfree[block] * DIRALIGN != nfree) panic("ufsdirhash_checkblock: bad free count"); ffslot = BLKFREE2IDX(nfree / DIRALIGN); for (i = 0; i <= DH_NFSTATS; i++) if (dh->dh_firstfree[i] == block && i != ffslot) panic("ufsdirhash_checkblock: bad first-free"); if (dh->dh_firstfree[ffslot] == -1) panic("ufsdirhash_checkblock: missing first-free entry"); DIRHASH_UNLOCK(dh); } /* * Hash the specified filename into a dirhash slot. */ int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen) { return SipHash24(&ufsdirhash_key, name, namelen) % dh->dh_hlen; } /* * Adjust the number of free bytes in the block containing `offset' * by the value specified by `diff'. * * The caller must ensure we have exclusive access to `dh'; normally * that means that dh_mtx should be held, but this is also called * from ufsdirhash_build() where exclusive access can be assumed. */ void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff) { int block, i, nfidx, ofidx; /* Update the per-block summary info. */ block = offset / DIRBLKSIZ; DIRHASH_ASSERT(block < dh->dh_nblk && block < dh->dh_dirblks, ("dirhash bad offset")); ofidx = BLKFREE2IDX(dh->dh_blkfree[block]); dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN); nfidx = BLKFREE2IDX(dh->dh_blkfree[block]); /* Update the `first free' list if necessary. */ if (ofidx != nfidx) { /* If removing, scan forward for the next block. */ if (dh->dh_firstfree[ofidx] == block) { for (i = block + 1; i < dh->dh_dirblks; i++) if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx) break; dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1; } /* Make this the new `first free' if necessary */ if (dh->dh_firstfree[nfidx] > block || dh->dh_firstfree[nfidx] == -1) dh->dh_firstfree[nfidx] = block; } } /* * Find the specified name which should have the specified offset. * Returns a slot number, and panics on failure. * * `dh' must be locked on entry and remains so on return. */ int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset) { int slot; mtx_assert(&dh->dh_mtx, MA_OWNED); /* Find the entry. 
*/ DIRHASH_ASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full")); slot = ufsdirhash_hash(dh, name, namelen); while (DH_ENTRY(dh, slot) != offset && DH_ENTRY(dh, slot) != DIRHASH_EMPTY) slot = WRAPINCR(slot, dh->dh_hlen); if (DH_ENTRY(dh, slot) != offset) panic("ufsdirhash_findslot: '%.*s' not found", namelen, name); return (slot); } /* * Remove the entry corresponding to the specified slot from the hash array. * * `dh' must be locked on entry and remains so on return. */ void ufsdirhash_delslot(struct dirhash *dh, int slot) { int i; mtx_assert(&dh->dh_mtx, MA_OWNED); /* Mark the entry as deleted. */ DH_ENTRY(dh, slot) = DIRHASH_DEL; /* If this is the end of a chain of DIRHASH_DEL slots, remove them. */ for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) i = WRAPINCR(i, dh->dh_hlen); if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) { i = WRAPDECR(i, dh->dh_hlen); while (DH_ENTRY(dh, i) == DIRHASH_DEL) { DH_ENTRY(dh, i) = DIRHASH_EMPTY; dh->dh_hused--; i = WRAPDECR(i, dh->dh_hlen); } DIRHASH_ASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen")); } } /* * Given a directory entry and its offset, find the offset of the * previous entry in the same DIRBLKSIZ-sized block. Returns an * offset, or -1 if there is no previous entry in the block or some * other problem occurred. */ doff_t ufsdirhash_getprev(struct direct *dirp, doff_t offset) { struct direct *dp; char *blkbuf; doff_t blkoff, prevoff; int entrypos, i; blkoff = offset & ~(DIRBLKSIZ - 1); /* offset of start of block */ entrypos = offset & (DIRBLKSIZ - 1); /* entry relative to block */ blkbuf = (char *)dirp - entrypos; prevoff = blkoff; /* If `offset' is the start of a block, there is no previous entry. */ if (entrypos == 0) return (-1); /* Scan from the start of the block until we get to the entry. */ for (i = 0; i < entrypos; i += dp->d_reclen) { dp = (struct direct *)(blkbuf + i); if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos) return (-1); /* Corrupted directory. */ prevoff = blkoff + i; } return (prevoff); } /* * Try to free up `wanted' bytes by stealing memory from existing * dirhashes. Returns zero with list locked if successful. */ int ufsdirhash_recycle(int wanted) { struct dirhash *dh; doff_t **hash; u_int8_t *blkfree; int i, mem, narrays, nblk; DIRHASHLIST_LOCK(); while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) { /* Find a dirhash, and lock it. */ if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) { DIRHASHLIST_UNLOCK(); return (-1); } DIRHASH_LOCK(dh); DIRHASH_ASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list")); /* Decrement the score; only recycle if it becomes zero. */ if (--dh->dh_score > 0) { DIRHASH_UNLOCK(dh); DIRHASHLIST_UNLOCK(); return (-1); } /* Remove it from the list and detach its memory. */ TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); dh->dh_onlist = 0; hash = dh->dh_hash; dh->dh_hash = NULL; blkfree = dh->dh_blkfree; dh->dh_blkfree = NULL; narrays = dh->dh_narrays; nblk = dh->dh_nblk; mem = narrays * sizeof(*dh->dh_hash) + narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + dh->dh_nblk * sizeof(*dh->dh_blkfree); /* Unlock everything, free the detached memory. */ DIRHASH_UNLOCK(dh); DIRHASHLIST_UNLOCK(); for (i = 0; i < narrays; i++) DIRHASH_BLKFREE(hash[i]); free(hash, M_DIRHASH, narrays * sizeof(hash[0])); free(blkfree, M_DIRHASH, nblk * sizeof(blkfree[0])); /* Account for the returned memory, and repeat if necessary. */ DIRHASHLIST_LOCK(); ufs_dirhashmem -= mem; } /* Success; return with list locked. 
*/ return (0); } void ufsdirhash_init(void) { pool_init(&ufsdirhash_pool, DH_NBLKOFF * sizeof(doff_t), 0, IPL_NONE, PR_WAITOK, "dirhash", NULL); rw_init(&ufsdirhash_mtx, "dirhash_list"); arc4random_buf(&ufsdirhash_key, sizeof(ufsdirhash_key)); TAILQ_INIT(&ufsdirhash_list); ufs_dirhashmaxmem = 5 * 1024 * 1024; ufs_mindirhashsize = 5 * DIRBLKSIZ; } void ufsdirhash_uninit(void) { DIRHASH_ASSERT(TAILQ_EMPTY(&ufsdirhash_list), ("ufsdirhash_uninit")); pool_destroy(&ufsdirhash_pool); }
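/*
 * Illustrative sketch (not from the original file): the probing
 * discipline the dirhash relies on, reduced to a stand-alone table --
 * linear probing with WRAPINCR-style wraparound, where deletions leave
 * tombstones (compare DIRHASH_EMPTY/DIRHASH_DEL above) so lookups keep
 * probing past them.  All demo_* names are hypothetical.
 */
#if 0	/* example only; never compiled */
#include <stdint.h>

#define DEMO_HLEN	8			/* table size */
#define DEMO_EMPTY	(-1)			/* slot never used */
#define DEMO_DEL	(-2)			/* tombstone */
#define DEMO_WRAPINCR(v) (((v) + 1 == DEMO_HLEN) ? 0 : ((v) + 1))

static int32_t demo_tab[DEMO_HLEN];		/* live entries are >= 0 */

static void
demo_init(void)
{
	int i;

	for (i = 0; i < DEMO_HLEN; i++)
		demo_tab[i] = DEMO_EMPTY;
}

/* Insert: reuse the first EMPTY or DEL slot, as ufsdirhash_add() does. */
static void
demo_insert(int slot, int32_t off)
{
	while (demo_tab[slot] >= 0)
		slot = DEMO_WRAPINCR(slot);
	demo_tab[slot] = off;
}

/*
 * Lookup: skip tombstones; only EMPTY terminates the probe chain.
 * Termination relies on the table being kept under-full, just as the
 * dirhash rebuilds itself once usage reaches 75%.
 */
static int
demo_lookup(int slot, int32_t off)
{
	while (demo_tab[slot] != DEMO_EMPTY) {
		if (demo_tab[slot] == off)
			return slot;
		slot = DEMO_WRAPINCR(slot);
	}
	return -1;
}

/* Delete: write a tombstone, not EMPTY, or chains through here break. */
static void
demo_delete(int slot, int32_t off)
{
	slot = demo_lookup(slot, off);
	if (slot != -1)
		demo_tab[slot] = DEMO_DEL;
}
#endif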
/* * Copyright 2016 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * * Authors: Christian König */ #ifndef __AMDGPU_RING_H__ #define __AMDGPU_RING_H__ #include <drm/amdgpu_drm.h> #include <drm/gpu_scheduler.h> #include <drm/drm_print.h> /* max number of rings */ #define AMDGPU_MAX_RINGS 28 #define AMDGPU_MAX_GFX_RINGS 2 #define AMDGPU_MAX_COMPUTE_RINGS 8 #define AMDGPU_MAX_VCE_RINGS 3 #define AMDGPU_MAX_UVD_ENC_RINGS 2 /* some special values for the owner field */ #define AMDGPU_FENCE_OWNER_UNDEFINED ((void *)0ul) #define AMDGPU_FENCE_OWNER_VM ((void *)1ul) #define AMDGPU_FENCE_OWNER_KFD ((void *)2ul) #define AMDGPU_FENCE_FLAG_64BIT (1 << 0) #define AMDGPU_FENCE_FLAG_INT (1 << 1) #define AMDGPU_FENCE_FLAG_TC_WB_ONLY (1 << 2) #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched) enum amdgpu_ring_type { AMDGPU_RING_TYPE_GFX, AMDGPU_RING_TYPE_COMPUTE, AMDGPU_RING_TYPE_SDMA, AMDGPU_RING_TYPE_UVD, AMDGPU_RING_TYPE_VCE, AMDGPU_RING_TYPE_KIQ, AMDGPU_RING_TYPE_UVD_ENC, AMDGPU_RING_TYPE_VCN_DEC, AMDGPU_RING_TYPE_VCN_ENC, AMDGPU_RING_TYPE_VCN_JPEG }; struct amdgpu_device; struct amdgpu_ring; struct amdgpu_ib; struct amdgpu_cs_parser; struct amdgpu_job; /* * Fences. */ struct amdgpu_fence_driver { uint64_t gpu_addr; volatile uint32_t *cpu_addr; /* sync_seq is protected by ring emission lock */ uint32_t sync_seq; atomic_t last_seq; bool initialized; struct amdgpu_irq_src *irq_src; unsigned irq_type; struct timeout fallback_timer; unsigned num_fences_mask; spinlock_t lock; struct dma_fence **fences; }; int amdgpu_fence_driver_init(struct amdgpu_device *adev); void amdgpu_fence_driver_fini(struct amdgpu_device *adev); void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring); int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, unsigned num_hw_submission); int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, struct amdgpu_irq_src *irq_src, unsigned irq_type); void amdgpu_fence_driver_suspend(struct amdgpu_device *adev); void amdgpu_fence_driver_resume(struct amdgpu_device *adev); int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, unsigned flags); int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s); bool amdgpu_fence_process(struct amdgpu_ring *ring); int amdgpu_fence_wait_empty(struct amdgpu_ring *ring); signed long amdgpu_fence_wait_polling(struct amdgpu_ring *ring, uint32_t wait_seq, signed long timeout); unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring); /* * Rings.
*/ /* provided by hw blocks that expose a ring buffer for commands */ struct amdgpu_ring_funcs { enum amdgpu_ring_type type; uint32_t align_mask; u32 nop; bool support_64bit_ptrs; bool no_user_fence; unsigned vmhub; unsigned extra_dw; /* ring read/write ptr handling */ u64 (*get_rptr)(struct amdgpu_ring *ring); u64 (*get_wptr)(struct amdgpu_ring *ring); void (*set_wptr)(struct amdgpu_ring *ring); /* validating and patching of IBs */ int (*parse_cs)(struct amdgpu_cs_parser *p, uint32_t ib_idx); int (*patch_cs_in_place)(struct amdgpu_cs_parser *p, uint32_t ib_idx); /* constants to calculate how many DW are needed for an emit */ unsigned emit_frame_size; unsigned emit_ib_size; /* command emit functions */ void (*emit_ib)(struct amdgpu_ring *ring, struct amdgpu_job *job, struct amdgpu_ib *ib, uint32_t flags); void (*emit_fence)(struct amdgpu_ring *ring, uint64_t addr, uint64_t seq, unsigned flags); void (*emit_pipeline_sync)(struct amdgpu_ring *ring); void (*emit_vm_flush)(struct amdgpu_ring *ring, unsigned vmid, uint64_t pd_addr); void (*emit_hdp_flush)(struct amdgpu_ring *ring); void (*emit_gds_switch)(struct amdgpu_ring *ring, uint32_t vmid, uint32_t gds_base, uint32_t gds_size, uint32_t gws_base, uint32_t gws_size, uint32_t oa_base, uint32_t oa_size); /* testing functions */ int (*test_ring)(struct amdgpu_ring *ring); int (*test_ib)(struct amdgpu_ring *ring, long timeout); /* insert NOP packets */ void (*insert_nop)(struct amdgpu_ring *ring, uint32_t count); void (*insert_start)(struct amdgpu_ring *ring); void (*insert_end)(struct amdgpu_ring *ring); /* pad the indirect buffer to the necessary number of dw */ void (*pad_ib)(struct amdgpu_ring *ring, struct amdgpu_ib *ib); unsigned (*init_cond_exec)(struct amdgpu_ring *ring); void (*patch_cond_exec)(struct amdgpu_ring *ring, unsigned offset); /* note usage for clock and power gating */ void (*begin_use)(struct amdgpu_ring *ring); void (*end_use)(struct amdgpu_ring *ring); void (*emit_switch_buffer) (struct amdgpu_ring *ring); void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags); void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg); void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val); void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val, uint32_t mask); void (*emit_reg_write_reg_wait)(struct amdgpu_ring *ring, uint32_t reg0, uint32_t reg1, uint32_t ref, uint32_t mask); void (*emit_tmz)(struct amdgpu_ring *ring, bool start); /* Try to soft recover the ring to make the fence signal */ void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid); int (*preempt_ib)(struct amdgpu_ring *ring); }; struct amdgpu_ring { struct amdgpu_device *adev; const struct amdgpu_ring_funcs *funcs; struct amdgpu_fence_driver fence_drv; struct drm_gpu_scheduler sched; struct amdgpu_bo *ring_obj; volatile uint32_t *ring; unsigned rptr_offs; u64 wptr; u64 wptr_old; unsigned ring_size; unsigned max_dw; int count_dw; uint64_t gpu_addr; uint64_t ptr_mask; uint32_t buf_mask; u32 idx; u32 me; u32 pipe; u32 queue; struct amdgpu_bo *mqd_obj; uint64_t mqd_gpu_addr; void *mqd_ptr; uint64_t eop_gpu_addr; u32 doorbell_index; bool use_doorbell; bool use_pollmem; unsigned wptr_offs; unsigned fence_offs; uint64_t current_ctx; char name[16]; u32 trail_seq; unsigned trail_fence_offs; u64 trail_fence_gpu_addr; volatile u32 *trail_fence_cpu_addr; unsigned cond_exe_offs; u64 cond_exe_gpu_addr; volatile u32 *cond_exe_cpu_addr; unsigned vm_inv_eng; struct dma_fence *vmid_wait; bool has_compute_vm_bug; atomic_t 
num_jobs[DRM_SCHED_PRIORITY_MAX]; struct rwlock priority_mutex; /* protected by priority_mutex */ int priority; bool has_high_prio; #if defined(CONFIG_DEBUG_FS) struct dentry *ent; #endif }; #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib))) #define amdgpu_ring_patch_cs_in_place(r, p, ib) ((r)->funcs->patch_cs_in_place((p), (ib))) #define amdgpu_ring_test_ring(r) (r)->funcs->test_ring((r)) #define amdgpu_ring_test_ib(r, t) (r)->funcs->test_ib((r), (t)) #define amdgpu_ring_get_rptr(r) (r)->funcs->get_rptr((r)) #define amdgpu_ring_get_wptr(r) (r)->funcs->get_wptr((r)) #define amdgpu_ring_set_wptr(r) (r)->funcs->set_wptr((r)) #define amdgpu_ring_emit_ib(r, job, ib, flags) ((r)->funcs->emit_ib((r), (job), (ib), (flags))) #define amdgpu_ring_emit_pipeline_sync(r) (r)->funcs->emit_pipeline_sync((r)) #define amdgpu_ring_emit_vm_flush(r, vmid, addr) (r)->funcs->emit_vm_flush((r), (vmid), (addr)) #define amdgpu_ring_emit_fence(r, addr, seq, flags) (r)->funcs->emit_fence((r), (addr), (seq), (flags)) #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as)) #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r)) #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r)) #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d)) #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d)) #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v)) #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m)) #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b)) #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib))) #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r)) #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o)) #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r) int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw); void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count); void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib); void amdgpu_ring_commit(struct amdgpu_ring *ring); void amdgpu_ring_undo(struct amdgpu_ring *ring); int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring, unsigned ring_size, struct amdgpu_irq_src *irq_src, unsigned irq_type); void amdgpu_ring_fini(struct amdgpu_ring *ring); void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring, uint32_t reg0, uint32_t val0, uint32_t reg1, uint32_t val1); bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid, struct dma_fence *fence); static inline void amdgpu_ring_set_preempt_cond_exec(struct amdgpu_ring *ring, bool cond_exec) { *ring->cond_exe_cpu_addr = cond_exec; } static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring) { int i = 0; while (i <= ring->buf_mask) ring->ring[i++] = ring->funcs->nop; } static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v) { if (ring->count_dw <= 0) DRM_ERROR("amdgpu: writing more dwords to the ring than expected!\n"); ring->ring[ring->wptr++ & ring->buf_mask] = v; ring->wptr &= ring->ptr_mask; ring->count_dw--; } static inline void amdgpu_ring_write_multiple(struct amdgpu_ring *ring, void *src, int count_dw) { unsigned occupied, chunk1, chunk2; void *dst; if 
(unlikely(ring->count_dw < count_dw)) DRM_ERROR("amdgpu: writing more dwords to the ring than expected!\n"); occupied = ring->wptr & ring->buf_mask; dst = (void *)&ring->ring[occupied]; chunk1 = ring->buf_mask + 1 - occupied; chunk1 = (chunk1 >= count_dw) ? count_dw: chunk1; chunk2 = count_dw - chunk1; chunk1 <<= 2; chunk2 <<= 2; if (chunk1) memcpy(dst, src, chunk1); if (chunk2) { src += chunk1; dst = (void *)ring->ring; memcpy(dst, src, chunk2); } ring->wptr += count_dw; ring->wptr &= ring->ptr_mask; ring->count_dw -= count_dw; } int amdgpu_ring_test_helper(struct amdgpu_ring *ring); int amdgpu_debugfs_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring); void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring); #endif
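/*
 * A minimal standalone sketch (not amdgpu code) of the masked ring-buffer
 * write used by amdgpu_ring_write() above: because the ring size is a
 * power of two, "wptr & buf_mask" replaces a modulo, and wptr itself can
 * grow monotonically. toy_ring, RING_DW and toy_ring_write are made-up
 * names for illustration only.
 */
#include <stdint.h>
#include <stdio.h>

#define RING_DW 8			/* must be a power of two */

struct toy_ring {
	uint32_t buf[RING_DW];
	uint64_t wptr;			/* monotonically increasing */
	uint32_t buf_mask;		/* RING_DW - 1 */
	int count_dw;			/* dwords reserved but not yet written */
};

static void toy_ring_write(struct toy_ring *r, uint32_t v)
{
	if (r->count_dw <= 0)
		fprintf(stderr, "toy: writing more dwords than reserved\n");
	r->buf[r->wptr++ & r->buf_mask] = v;	/* wrap by masking */
	r->count_dw--;
}

int main(void)
{
	struct toy_ring r = { .buf_mask = RING_DW - 1, .count_dw = 10 };
	uint32_t i;

	for (i = 0; i < 10; i++)	/* 10 writes into 8 slots: wraps */
		toy_ring_write(&r, i);
	printf("slot 0 holds %u after wrapping\n", r.buf[0]);	/* prints 8 */
	return 0;
}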
/* Public domain. */ #ifndef _LINUX_DELAY_H #define _LINUX_DELAY_H #include <sys/param.h> static inline void udelay(unsigned long usecs) { DELAY(usecs); } static inline void ndelay(unsigned long nsecs) { DELAY(MAX(nsecs / 1000, 1)); } static inline void usleep_range(unsigned long min, unsigned long max) { DELAY((min + max) / 2); } static inline void mdelay(unsigned long msecs) { int loops = msecs; while (loops--) DELAY(1000); } #define drm_msleep(x) mdelay(x) #endif
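/*
 * A hedged usage sketch for the shims above (example_settle_delays is a
 * made-up helper, not part of any driver): every wrapper ends up in
 * OpenBSD's DELAY(), which busy-waits for the given number of
 * microseconds, so none of these actually sleep.
 */
static void example_settle_delays(void)
{
	udelay(10);		/* DELAY(10): 10 us */
	ndelay(100);		/* DELAY(MAX(100 / 1000, 1)) == DELAY(1): sub-us rounds up to 1 us */
	usleep_range(100, 200);	/* DELAY(150): the midpoint of the range */
	mdelay(3);		/* three DELAY(1000) iterations: 3 ms */
	drm_msleep(3);		/* drm_msleep is defined as mdelay */
}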
/* * Copyright (c) 2015 NVIDIA Corporation. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sub license, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef DRM_SCDC_HELPER_H #define DRM_SCDC_HELPER_H #include <linux/i2c.h> #include <linux/types.h> #define SCDC_SINK_VERSION 0x01 #define SCDC_SOURCE_VERSION 0x02 #define SCDC_UPDATE_0 0x10 #define SCDC_READ_REQUEST_TEST (1 << 2) #define SCDC_CED_UPDATE (1 << 1) #define SCDC_STATUS_UPDATE (1 << 0) #define SCDC_UPDATE_1 0x11 #define SCDC_TMDS_CONFIG 0x20 #define SCDC_TMDS_BIT_CLOCK_RATIO_BY_40 (1 << 1) #define SCDC_TMDS_BIT_CLOCK_RATIO_BY_10 (0 << 1) #define SCDC_SCRAMBLING_ENABLE (1 << 0) #define SCDC_SCRAMBLER_STATUS 0x21 #define SCDC_SCRAMBLING_STATUS (1 << 0) #define SCDC_CONFIG_0 0x30 #define SCDC_READ_REQUEST_ENABLE (1 << 0) #define SCDC_STATUS_FLAGS_0 0x40 #define SCDC_CH2_LOCK (1 << 3) #define SCDC_CH1_LOCK (1 << 2) #define SCDC_CH0_LOCK (1 << 1) #define SCDC_CH_LOCK_MASK (SCDC_CH2_LOCK | SCDC_CH1_LOCK | SCDC_CH0_LOCK) #define SCDC_CLOCK_DETECT (1 << 0) #define SCDC_STATUS_FLAGS_1 0x41 #define SCDC_ERR_DET_0_L 0x50 #define SCDC_ERR_DET_0_H 0x51 #define SCDC_ERR_DET_1_L 0x52 #define SCDC_ERR_DET_1_H 0x53 #define SCDC_ERR_DET_2_L 0x54 #define SCDC_ERR_DET_2_H 0x55 #define SCDC_CHANNEL_VALID (1 << 7) #define SCDC_ERR_DET_CHECKSUM 0x56 #define SCDC_TEST_CONFIG_0 0xc0 #define SCDC_TEST_READ_REQUEST (1 << 7) #define SCDC_TEST_READ_REQUEST_DELAY(x) ((x) & 0x7f) #define SCDC_MANUFACTURER_IEEE_OUI 0xd0 #define SCDC_MANUFACTURER_IEEE_OUI_SIZE 3 #define SCDC_DEVICE_ID 0xd3 #define SCDC_DEVICE_ID_SIZE 8 #define SCDC_DEVICE_HARDWARE_REVISION 0xdb #define SCDC_GET_DEVICE_HARDWARE_REVISION_MAJOR(x) (((x) >> 4) & 0xf) #define SCDC_GET_DEVICE_HARDWARE_REVISION_MINOR(x) (((x) >> 0) & 0xf) #define SCDC_DEVICE_SOFTWARE_MAJOR_REVISION 0xdc #define SCDC_DEVICE_SOFTWARE_MINOR_REVISION 0xdd #define SCDC_MANUFACTURER_SPECIFIC 0xde #define SCDC_MANUFACTURER_SPECIFIC_SIZE 34 ssize_t drm_scdc_read(struct i2c_adapter *adapter, u8 offset, void *buffer, size_t size); ssize_t drm_scdc_write(struct i2c_adapter *adapter, u8 offset, const void *buffer, size_t size); /** * drm_scdc_readb - read a single byte from SCDC * @adapter: I2C adapter * @offset: offset of register to read * @value: return location for the register value * * Reads a single byte from SCDC. This is a convenience wrapper around the * drm_scdc_read() function. * * Returns: * 0 on success or a negative error code on failure. 
*/ static inline int drm_scdc_readb(struct i2c_adapter *adapter, u8 offset, u8 *value) { return drm_scdc_read(adapter, offset, value, sizeof(*value)); } /** * drm_scdc_writeb - write a single byte to SCDC * @adapter: I2C adapter * @offset: offset of register to write * @value: value to write * * Writes a single byte to SCDC. This is a convenience wrapper around the * drm_scdc_write() function. * * Returns: * 0 on success or a negative error code on failure. */ static inline int drm_scdc_writeb(struct i2c_adapter *adapter, u8 offset, u8 value) { return drm_scdc_write(adapter, offset, &value, sizeof(value)); } bool drm_scdc_get_scrambling_status(struct i2c_adapter *adapter); bool drm_scdc_set_scrambling(struct i2c_adapter *adapter, bool enable); bool drm_scdc_set_high_tmds_clock_ratio(struct i2c_adapter *adapter, bool set); #endif
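/*
 * A hedged usage sketch for the helpers above: bringing up scrambling on
 * an HDMI 2.0 sink whose TMDS character rate exceeds 340 MHz. The
 * function name and the surrounding driver context are hypothetical;
 * only the drm_scdc_* calls and SCDC_SINK_VERSION come from this header.
 */
#include <linux/errno.h>

static int example_enable_scrambling(struct i2c_adapter *ddc)
{
	u8 version = 0;
	int ret;

	/* sinks without SCDC simply won't answer on the DDC bus */
	ret = drm_scdc_readb(ddc, SCDC_SINK_VERSION, &version);
	if (ret < 0 || version == 0)
		return ret < 0 ? ret : -ENODEV;

	/* above 340 MHz both the 1/40 clock ratio and scrambling are
	 * mandatory; both settings live in SCDC_TMDS_CONFIG */
	if (!drm_scdc_set_high_tmds_clock_ratio(ddc, true) ||
	    !drm_scdc_set_scrambling(ddc, true))
		return -EIO;
	return 0;
}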
/* $OpenBSD: nfs_syscalls.c,v 1.117 2021/03/11 13:31:35 jsg Exp $ */ /* $NetBSD: nfs_syscalls.c,v 1.19 1996/02/18 11:53:52 fvdl Exp $ */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_syscalls.c 8.5 (Berkeley) 3/30/95 */ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/pool.h> #include <sys/proc.h> #include <sys/uio.h> #include <sys/malloc.h> #include <sys/buf.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/namei.h> #include <sys/syslog.h> #include <sys/filedesc.h> #include <sys/signalvar.h> #include <sys/kthread.h> #include <sys/queue.h> #include <sys/syscallargs.h> #include <netinet/in.h> #include <netinet/tcp.h> #include <nfs/xdr_subs.h> #include <nfs/rpcv2.h> #include <nfs/nfsproto.h> #include <nfs/nfs.h> #include <nfs/nfsrvcache.h> #include <nfs/nfsmount.h> #include <nfs/nfsnode.h> #include <nfs/nfs_var.h> /* Global defs.
*/ extern int nfs_numasync; extern struct nfsstats nfsstats; struct nfssvc_sock *nfs_udpsock; int nfsd_waiting = 0; #ifdef NFSSERVER struct pool nfsrv_descript_pl; int nfsrv_getslp(struct nfsd *nfsd); static int nfs_numnfsd = 0; int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *, struct nfssvc_sock *, struct proc *, struct mbuf **) = { nfsrv_null, nfsrv_getattr, nfsrv_setattr, nfsrv_lookup, nfsrv3_access, nfsrv_readlink, nfsrv_read, nfsrv_write, nfsrv_create, nfsrv_mkdir, nfsrv_symlink, nfsrv_mknod, nfsrv_remove, nfsrv_rmdir, nfsrv_rename, nfsrv_link, nfsrv_readdir, nfsrv_readdirplus, nfsrv_statfs, nfsrv_fsinfo, nfsrv_pathconf, nfsrv_commit, nfsrv_noop }; #endif TAILQ_HEAD(, nfssvc_sock) nfssvc_sockhead; struct nfsdhead nfsd_head; int nfssvc_sockhead_flag; #define SLP_INIT 0x01 /* NFS data undergoing initialization */ #define SLP_WANTINIT 0x02 /* thread waiting on NFS initialization */ int nfsd_head_flag; #ifdef NFSCLIENT struct proc *nfs_asyncdaemon[NFS_MAXASYNCDAEMON]; int nfs_niothreads = -1; #endif int nfssvc_addsock(struct file *, struct mbuf *); int nfssvc_nfsd(struct nfsd *); void nfsrv_slpderef(struct nfssvc_sock *); void nfsrv_zapsock(struct nfssvc_sock *); void nfssvc_iod(void *); /* * NFS server pseudo system call for the nfsd's * Based on the flag value it either: * - adds a socket to the selection list * - remains in the kernel as an nfsd */ int sys_nfssvc(struct proc *p, void *v, register_t *retval) { int error = 0; #ifdef NFSSERVER struct sys_nfssvc_args /* { syscallarg(int) flag; syscallarg(caddr_t) argp; } */ *uap = v; int flags = SCARG(uap, flag); struct file *fp; struct mbuf *nam; struct nfsd_args nfsdarg; struct nfsd_srvargs nfsd_srvargs, *nsd = &nfsd_srvargs; struct nfsd *nfsd; #endif /* Must be super user */ error = suser(p); if (error) return (error); #ifndef NFSSERVER error = ENOSYS; #else while (nfssvc_sockhead_flag & SLP_INIT) { nfssvc_sockhead_flag |= SLP_WANTINIT; tsleep_nsec(&nfssvc_sockhead, PSOCK, "nfsd init", INFSLP); } switch (flags) { case NFSSVC_ADDSOCK: error = copyin(SCARG(uap, argp), &nfsdarg, sizeof(nfsdarg)); if (error) return (error); error = getsock(p, nfsdarg.sock, &fp); if (error) return (error); /* * Get the client address for connected sockets. */ if (nfsdarg.name == NULL || nfsdarg.namelen == 0) nam = NULL; else { error = sockargs(&nam, nfsdarg.name, nfsdarg.namelen, MT_SONAME); if (error) { FRELE(fp, p); return (error); } } error = nfssvc_addsock(fp, nam); FRELE(fp, p); break; case NFSSVC_NFSD: error = copyin(SCARG(uap, argp), nsd, sizeof(*nsd)); if (error) return (error); nfsd = malloc(sizeof(*nfsd), M_NFSD, M_WAITOK|M_ZERO); nfsd->nfsd_procp = p; nfsd->nfsd_slp = NULL; error = nfssvc_nfsd(nfsd); break; default: error = EINVAL; break; } if (error == EINTR || error == ERESTART) error = 0; #endif /* !NFSSERVER */ return (error); } #ifdef NFSSERVER /* * Adds a socket to the list for servicing by nfsds. */ int nfssvc_addsock(struct file *fp, struct mbuf *mynam) { struct mbuf *m; int siz; struct nfssvc_sock *slp; struct socket *so; struct nfssvc_sock *tslp; int s, error; so = (struct socket *)fp->f_data; tslp = NULL; /* * Add it to the list, as required. 
*/ if (so->so_proto->pr_protocol == IPPROTO_UDP) { tslp = nfs_udpsock; if (tslp->ns_flag & SLP_VALID) { m_freem(mynam); return (EPERM); } } if (so->so_type == SOCK_STREAM) siz = NFS_MAXPACKET + sizeof (u_long); else siz = NFS_MAXPACKET; s = solock(so); error = soreserve(so, siz, siz); if (error) { sounlock(so, s); m_freem(mynam); return (error); } /* * Set protocol specific options { for now TCP only } and * reserve some space. For datagram sockets, this can get called * repeatedly for the same socket, but that isn't harmful. */ if (so->so_type == SOCK_STREAM) { MGET(m, M_WAIT, MT_SOOPTS); *mtod(m, int32_t *) = 1; m->m_len = sizeof(int32_t); sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m); m_freem(m); } if (so->so_proto->pr_domain->dom_family == AF_INET && so->so_proto->pr_protocol == IPPROTO_TCP) { MGET(m, M_WAIT, MT_SOOPTS); *mtod(m, int32_t *) = 1; m->m_len = sizeof(int32_t); sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m); m_freem(m); } so->so_rcv.sb_flags &= ~SB_NOINTR; so->so_rcv.sb_timeo_nsecs = INFSLP; so->so_snd.sb_flags &= ~SB_NOINTR; so->so_snd.sb_timeo_nsecs = INFSLP; sounlock(so, s); if (tslp) slp = tslp; else { slp = malloc(sizeof(*slp), M_NFSSVC, M_WAITOK|M_ZERO); TAILQ_INSERT_TAIL(&nfssvc_sockhead, slp, ns_chain); } slp->ns_so = so; slp->ns_nam = mynam; FREF(fp); slp->ns_fp = fp; so->so_upcallarg = (caddr_t)slp; so->so_upcall = nfsrv_rcv; slp->ns_flag = (SLP_VALID | SLP_NEEDQ); nfsrv_wakenfsd(slp); return (0); } /* * Called by nfssvc() for nfsds. Just loops around servicing rpc requests * until it is killed by a signal. */ int nfssvc_nfsd(struct nfsd *nfsd) { struct mbuf *m; int siz; struct nfssvc_sock *slp; struct socket *so; int *solockp; struct nfsrv_descript *nd = NULL; struct mbuf *mreq; int error = 0, cacherep, sotype; cacherep = RC_DOIT; TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain); nfs_numnfsd++; /* Loop getting rpc requests until SIGKILL. 
*/ loop: if (!ISSET(nfsd->nfsd_flag, NFSD_REQINPROG)) { /* attach an nfssvc_sock to nfsd */ error = nfsrv_getslp(nfsd); if (error) goto done; slp = nfsd->nfsd_slp; if (ISSET(slp->ns_flag, SLP_VALID)) { if (ISSET(slp->ns_flag, SLP_DISCONN)) { nfsrv_zapsock(slp); } else if (ISSET(slp->ns_flag, SLP_NEEDQ)) { CLR(slp->ns_flag, SLP_NEEDQ); nfs_sndlock(&slp->ns_solock, NULL); nfsrv_rcv(slp->ns_so, (caddr_t)slp, M_WAIT); nfs_sndunlock(&slp->ns_solock); } error = nfsrv_dorec(slp, nfsd, &nd); SET(nfsd->nfsd_flag, NFSD_REQINPROG); } } else { error = 0; slp = nfsd->nfsd_slp; } if (error || !ISSET(slp->ns_flag, SLP_VALID)) { if (nd != NULL) { pool_put(&nfsrv_descript_pl, nd); nd = NULL; } nfsd->nfsd_slp = NULL; CLR(nfsd->nfsd_flag, NFSD_REQINPROG); nfsrv_slpderef(slp); goto loop; } so = slp->ns_so; sotype = so->so_type; if (ISSET(so->so_proto->pr_flags, PR_CONNREQUIRED)) solockp = &slp->ns_solock; else solockp = NULL; if (nd) { if (nd->nd_nam2) nd->nd_nam = nd->nd_nam2; else nd->nd_nam = slp->ns_nam; } cacherep = nfsrv_getcache(nd, slp, &mreq); switch (cacherep) { case RC_DOIT: error = (*(nfsrv3_procs[nd->nd_procnum]))(nd, slp, nfsd->nfsd_procp, &mreq); if (mreq == NULL) { if (nd != NULL) { m_freem(nd->nd_nam2); m_freem(nd->nd_mrep); } break; } if (error) { nfsstats.srv_errs++; nfsrv_updatecache(nd, 0, mreq); m_freem(nd->nd_nam2); break; } nfsstats.srvrpccnt[nd->nd_procnum]++; nfsrv_updatecache(nd, 1, mreq); nd->nd_mrep = NULL; /* FALLTHROUGH */ case RC_REPLY: m = mreq; siz = 0; while (m) { siz += m->m_len; m = m->m_next; } if (siz <= 0 || siz > NFS_MAXPACKET) panic("bad nfs svc reply, siz = %i", siz); m = mreq; m->m_pkthdr.len = siz; m->m_pkthdr.ph_ifidx = 0; /* For stream protocols, prepend a Sun RPC Record Mark. */ if (sotype == SOCK_STREAM) { M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); *mtod(m, u_int32_t *) = htonl(0x80000000 | siz); } if (solockp) nfs_sndlock(solockp, NULL); if (ISSET(slp->ns_flag, SLP_VALID)) error = nfs_send(so, nd->nd_nam2, m, NULL); else { error = EPIPE; m_freem(m); } m_freem(nd->nd_nam2); m_freem(nd->nd_mrep); if (error == EPIPE) nfsrv_zapsock(slp); if (solockp) nfs_sndunlock(solockp); if (error == EINTR || error == ERESTART) { pool_put(&nfsrv_descript_pl, nd); nfsrv_slpderef(slp); goto done; } break; case RC_DROPIT: m_freem(nd->nd_mrep); m_freem(nd->nd_nam2); break; }; if (nd) { pool_put(&nfsrv_descript_pl, nd); nd = NULL; } if (nfsrv_dorec(slp, nfsd, &nd)) { nfsd->nfsd_flag &= ~NFSD_REQINPROG; nfsd->nfsd_slp = NULL; nfsrv_slpderef(slp); } goto loop; done: TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain); free(nfsd, M_NFSD, sizeof(*nfsd)); if (--nfs_numnfsd == 0) nfsrv_init(1); /* Reinitialize everything */ return (error); } /* * Shut down a socket associated with an nfssvc_sock structure. * Should be called with the send lock set, if required. * The trick here is to increment the sref at the start, so that the nfsds * will stop using it and clear ns_flag at the end so that it will not be * reassigned during cleanup. */ void nfsrv_zapsock(struct nfssvc_sock *slp) { struct socket *so; struct file *fp; struct mbuf *m, *n; slp->ns_flag &= ~SLP_ALLFLAGS; fp = slp->ns_fp; if (fp) { FREF(fp); slp->ns_fp = NULL; so = slp->ns_so; so->so_upcall = NULL; soshutdown(so, SHUT_RDWR); closef(fp, NULL); if (slp->ns_nam) m = m_free(slp->ns_nam); m_freem(slp->ns_raw); m = slp->ns_rec; while (m) { n = m->m_nextpkt; m_freem(m); m = n; } } } /* * Dereference a server socket structure. If it has no more references and * is no longer valid, you can throw it away. 
*/ void nfsrv_slpderef(struct nfssvc_sock *slp) { if (--(slp->ns_sref) == 0 && (slp->ns_flag & SLP_VALID) == 0) { TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain); free(slp, M_NFSSVC, sizeof(*slp)); } } /* * Initialize the data structures for the server. * Handshake with any new nfsds starting up to avoid any chance of * corruption. */ void nfsrv_init(int terminating) { struct nfssvc_sock *slp, *nslp; if (nfssvc_sockhead_flag & SLP_INIT) panic("nfsd init"); nfssvc_sockhead_flag |= SLP_INIT; if (terminating) { for (slp = TAILQ_FIRST(&nfssvc_sockhead); slp != NULL; slp = nslp) { nslp = TAILQ_NEXT(slp, ns_chain); if (slp->ns_flag & SLP_VALID) nfsrv_zapsock(slp); TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain); free(slp, M_NFSSVC, sizeof(*slp)); } nfsrv_cleancache(); /* And clear out server cache */ } TAILQ_INIT(&nfssvc_sockhead); nfssvc_sockhead_flag &= ~SLP_INIT; if (nfssvc_sockhead_flag & SLP_WANTINIT) { nfssvc_sockhead_flag &= ~SLP_WANTINIT; wakeup((caddr_t)&nfssvc_sockhead); } TAILQ_INIT(&nfsd_head); nfsd_head_flag &= ~NFSD_CHECKSLP; nfs_udpsock = malloc(sizeof(*nfs_udpsock), M_NFSSVC, M_WAITOK|M_ZERO); TAILQ_INSERT_HEAD(&nfssvc_sockhead, nfs_udpsock, ns_chain); if (!terminating) { pool_init(&nfsrv_descript_pl, sizeof(struct nfsrv_descript), 0, IPL_NONE, PR_WAITOK, "ndscpl", NULL); } } #endif /* NFSSERVER */ #ifdef NFSCLIENT /* * Asynchronous I/O threads for client nfs. * They do read-ahead and write-behind operations on the block I/O cache. * Never returns unless it fails or gets killed. */ void nfssvc_iod(void *arg) { struct proc *p = curproc; struct buf *bp, *nbp; int i, myiod; struct vnode *vp; int error = 0, s, bufcount; bufcount = MIN(256, bcstats.kvaslots / 8); bufcount = MIN(bufcount, bcstats.numbufs / 8); /* Assign my position or return error if too many already running. */ myiod = -1; for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { if (nfs_asyncdaemon[i] == NULL) { myiod = i; break; } } if (myiod == -1) kthread_exit(EBUSY); nfs_asyncdaemon[myiod] = p; nfs_numasync++; /* Upper limit on how many bufs we'll queue up for this iod. */ if (nfs_bufqmax > bcstats.kvaslots / 4) { nfs_bufqmax = bcstats.kvaslots / 4; bufcount = 0; } if (nfs_bufqmax > bcstats.numbufs / 4) { nfs_bufqmax = bcstats.numbufs / 4; bufcount = 0; } nfs_bufqmax += bufcount; wakeup(&nfs_bufqlen); /* wake up anyone waiting for room to enqueue IO */ /* Just loop around doin our stuff until SIGKILL. */ for (;;) { while (TAILQ_FIRST(&nfs_bufq) == NULL && error == 0) { error = tsleep_nsec(&nfs_bufq, PWAIT | PCATCH, "nfsidl", INFSLP); } while ((bp = TAILQ_FIRST(&nfs_bufq)) != NULL) { /* Take one off the front of the list */ TAILQ_REMOVE(&nfs_bufq, bp, b_freelist); nfs_bufqlen--; wakeup_one(&nfs_bufqlen); if (bp->b_flags & B_READ) (void) nfs_doio(bp, NULL); else do { /* * Look for a delayed write for the same vnode, so I can do * it now. We must grab it before calling nfs_doio() to * avoid any risk of the vnode getting vclean()'d while * we are doing the write rpc. */ vp = bp->b_vp; s = splbio(); LIST_FOREACH(nbp, &vp->v_dirtyblkhd, b_vnbufs) { if ((nbp->b_flags & (B_BUSY|B_DELWRI|B_NEEDCOMMIT|B_NOCACHE))!=B_DELWRI) continue; nbp->b_flags |= B_ASYNC; bremfree(nbp); buf_acquire(nbp); break; } /* * For the delayed write, do the first part of nfs_bwrite() * up to, but not including nfs_strategy(). 
*/ if (nbp) { nbp->b_flags &= ~(B_READ|B_DONE|B_ERROR); buf_undirty(nbp); nbp->b_vp->v_numoutput++; } splx(s); (void) nfs_doio(bp, NULL); } while ((bp = nbp) != NULL); } if (error) { nfs_asyncdaemon[myiod] = NULL; nfs_numasync--; nfs_bufqmax -= bufcount; kthread_exit(error); } } } void nfs_getset_niothreads(int set) { int i, have, start; for (have = 0, i = 0; i < NFS_MAXASYNCDAEMON; i++) if (nfs_asyncdaemon[i] != NULL) have++; if (set) { /* clamp to sane range */ nfs_niothreads = max(0, min(nfs_niothreads, NFS_MAXASYNCDAEMON)); start = nfs_niothreads - have; while (start > 0) { kthread_create(nfssvc_iod, NULL, NULL, "nfsio"); start--; } for (i = 0; (start < 0) && (i < NFS_MAXASYNCDAEMON); i++) if (nfs_asyncdaemon[i] != NULL) { psignal(nfs_asyncdaemon[i], SIGKILL); start++; } } else { if (nfs_niothreads >= 0) nfs_niothreads = have; } } #endif /* NFSCLIENT */ #ifdef NFSSERVER /* * Find an nfssrv_sock for nfsd, sleeping if needed. */ int nfsrv_getslp(struct nfsd *nfsd) { struct nfssvc_sock *slp; int error; again: while (nfsd->nfsd_slp == NULL && (nfsd_head_flag & NFSD_CHECKSLP) == 0) { nfsd->nfsd_flag |= NFSD_WAITING; nfsd_waiting++; error = tsleep_nsec(nfsd, PSOCK | PCATCH, "nfsd", INFSLP); nfsd_waiting--; if (error) return (error); } if (nfsd->nfsd_slp == NULL && (nfsd_head_flag & NFSD_CHECKSLP) != 0) { TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) { if ((slp->ns_flag & (SLP_VALID | SLP_DOREC)) == (SLP_VALID | SLP_DOREC)) { slp->ns_flag &= ~SLP_DOREC; slp->ns_sref++; nfsd->nfsd_slp = slp; break; } } if (slp == NULL) nfsd_head_flag &= ~NFSD_CHECKSLP; } if (nfsd->nfsd_slp == NULL) goto again; return (0); } #endif /* NFSSERVER */
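/*
 * A hedged userland sketch of how a daemon such as nfsd(8) drives
 * sys_nfssvc() above: one NFSSVC_ADDSOCK call hands the kernel a bound
 * socket, then NFSSVC_NFSD turns the calling process into a server loop
 * (nfssvc_nfsd()) that only returns on signal or error. The helper name
 * is made up and setup/error handling is abbreviated; see nfssvc(2) for
 * the exact headers and prototypes.
 */
#include <sys/param.h>
#include <sys/mount.h>
#include <nfs/nfs.h>
#include <string.h>

static void example_serve_nfs(int sock)
{
	struct nfsd_args addarg;
	struct nfsd_srvargs srvarg;

	memset(&addarg, 0, sizeof(addarg));
	addarg.sock = sock;	/* bound UDP or listening TCP socket */
	addarg.name = NULL;	/* no peer address for a passive socket */
	addarg.namelen = 0;
	(void)nfssvc(NFSSVC_ADDSOCK, &addarg);

	memset(&srvarg, 0, sizeof(srvarg));
	(void)nfssvc(NFSSVC_NFSD, &srvarg);	/* loops until signalled */
}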
/* $OpenBSD: virtio.c,v 1.19 2019/05/26 15:20:04 sf Exp $ */ /* $NetBSD: virtio.c,v 1.3 2011/11/02 23:05:52 njoly Exp $ */ /* * Copyright (c) 2012 Stefan Fritsch, Alexander Fiveg. * Copyright (c) 2010 Minoura Makoto. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/device.h> #include <sys/mutex.h> #include <sys/atomic.h> #include <sys/malloc.h> #include <dev/pv/virtioreg.h> #include <dev/pv/virtiovar.h> #if VIRTIO_DEBUG #define VIRTIO_ASSERT(x) KASSERT(x) #else #define VIRTIO_ASSERT(x) #endif void virtio_init_vq(struct virtio_softc *, struct virtqueue *); void vq_free_entry(struct virtqueue *, struct vq_entry *); struct vq_entry *vq_alloc_entry(struct virtqueue *); struct cfdriver virtio_cd = { NULL, "virtio", DV_DULL }; static const char * const virtio_device_name[] = { "Unknown (0)", /* 0 */ "Network", /* 1 */ "Block", /* 2 */ "Console", /* 3 */ "Entropy", /* 4 */ "Memory Balloon", /* 5 */ "IO Memory", /* 6 */ "Rpmsg", /* 7 */ "SCSI host", /* 8 */ "9P Transport", /* 9 */ "mac80211 wlan" /* 10 */ }; #define NDEVNAMES (sizeof(virtio_device_name)/sizeof(char*)) const char * virtio_device_string(int id) { return id < NDEVNAMES ? virtio_device_name[id] : "Unknown"; } #if VIRTIO_DEBUG static const struct virtio_feature_name transport_feature_names[] = { { VIRTIO_F_NOTIFY_ON_EMPTY, "NotifyOnEmpty"}, { VIRTIO_F_RING_INDIRECT_DESC, "RingIndirectDesc"}, { VIRTIO_F_RING_EVENT_IDX, "RingEventIdx"}, { VIRTIO_F_BAD_FEATURE, "BadFeature"}, { VIRTIO_F_VERSION_1, "Version1"}, { 0, NULL} }; void virtio_log_features(uint64_t host, uint64_t neg, const struct virtio_feature_name *guest_feature_names) { const struct virtio_feature_name *namep; int i; char c; uint32_t bit; for (i = 0; i < 64; i++) { if (i == 30) { /* * VIRTIO_F_BAD_FEATURE is only used for * checking correct negotiation */ continue; } bit = 1 << i; if ((host&bit) == 0) continue; namep = (i < 24 || i > 37) ? guest_feature_names : transport_feature_names; while (namep->bit && namep->bit != bit) namep++; c = (neg&bit) ? '+' : '-'; if (namep->name) printf(" %c%s", c, namep->name); else printf(" %cUnknown(%d)", c, i); } } #endif /* * Reset the device.
*/ /* * To reset the device to a known state, do the following: * virtio_reset(sc); // this will stop the device activity * <dequeue finished requests>; // virtio_dequeue() still can be called * <revoke pending requests in the vqs if any>; * virtio_reinit_start(sc); // dequeue prohibited * <some other initialization>; * virtio_reinit_end(sc); // device activated; enqueue allowed * Once attached, features are assumed to not change again. */ void virtio_reset(struct virtio_softc *sc) { virtio_device_reset(sc); sc->sc_active_features = 0; } void virtio_reinit_start(struct virtio_softc *sc) { int i; virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_ACK); virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER); virtio_negotiate_features(sc, NULL); for (i = 0; i < sc->sc_nvqs; i++) { int n; struct virtqueue *vq = &sc->sc_vqs[i]; n = virtio_read_queue_size(sc, vq->vq_index); if (n == 0) /* vq disappeared */ continue; if (n != vq->vq_num) { panic("%s: virtqueue size changed, vq index %d\n", sc->sc_dev.dv_xname, vq->vq_index); } virtio_init_vq(sc, vq); virtio_setup_queue(sc, vq, vq->vq_dmamap->dm_segs[0].ds_addr); } } void virtio_reinit_end(struct virtio_softc *sc) { virtio_set_status(sc, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK); } /* * dmamap sync operations for a virtqueue. */ static inline void vq_sync_descs(struct virtio_softc *sc, struct virtqueue *vq, int ops) { /* availoffset == sizeof(vring_desc)*vq_num */ bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, 0, vq->vq_availoffset, ops); } static inline void vq_sync_aring(struct virtio_softc *sc, struct virtqueue *vq, int ops) { bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, vq->vq_availoffset, offsetof(struct vring_avail, ring) + vq->vq_num * sizeof(uint16_t), ops); } static inline void vq_sync_uring(struct virtio_softc *sc, struct virtqueue *vq, int ops) { bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, vq->vq_usedoffset, offsetof(struct vring_used, ring) + vq->vq_num * sizeof(struct vring_used_elem), ops); } static inline void vq_sync_indirect(struct virtio_softc *sc, struct virtqueue *vq, int slot, int ops) { int offset = vq->vq_indirectoffset + sizeof(struct vring_desc) * vq->vq_maxnsegs * slot; bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, offset, sizeof(struct vring_desc) * vq->vq_maxnsegs, ops); } /* * Scan the vqs, performing bus_dmamap_sync for each vq (not for the * payload), and call (*vq_done)() if some entries have been consumed. * For use in transport-specific irq handlers. */ int virtio_check_vqs(struct virtio_softc *sc) { struct virtqueue *vq; int i, r = 0; /* going backwards is better for if_vio */ for (i = sc->sc_nvqs - 1; i >= 0; i--) { vq = &sc->sc_vqs[i]; if (vq->vq_queued) { vq->vq_queued = 0; vq_sync_aring(sc, vq, BUS_DMASYNC_POSTWRITE); } vq_sync_uring(sc, vq, BUS_DMASYNC_POSTREAD); if (vq->vq_used_idx != vq->vq_used->idx) { if (vq->vq_done) r |= (vq->vq_done)(vq); } } return r; } /* * Initialize vq structure.
*/ void virtio_init_vq(struct virtio_softc *sc, struct virtqueue *vq) { int i, j; int vq_size = vq->vq_num; memset(vq->vq_vaddr, 0, vq->vq_bytesize); /* build the indirect descriptor chain */ if (vq->vq_indirect != NULL) { struct vring_desc *vd; for (i = 0; i < vq_size; i++) { vd = vq->vq_indirect; vd += vq->vq_maxnsegs * i; for (j = 0; j < vq->vq_maxnsegs-1; j++) vd[j].next = j + 1; } } /* free slot management */ SLIST_INIT(&vq->vq_freelist); /* * virtio_enqueue_trim needs monotonically increasing entries, * therefore initialize in reverse order */ for (i = vq_size - 1; i >= 0; i--) { SLIST_INSERT_HEAD(&vq->vq_freelist, &vq->vq_entries[i], qe_list); vq->vq_entries[i].qe_index = i; } /* enqueue/dequeue status */ vq->vq_avail_idx = 0; vq->vq_used_idx = 0; vq_sync_aring(sc, vq, BUS_DMASYNC_PREWRITE); vq_sync_uring(sc, vq, BUS_DMASYNC_PREREAD); vq->vq_queued = 1; } /* * Allocate/free a vq. * * maxnsegs denotes how much space should be allocated for indirect * descriptors. maxnsegs == 1 can be used to disable the use of indirect * descriptors for this queue. */ int virtio_alloc_vq(struct virtio_softc *sc, struct virtqueue *vq, int index, int maxsegsize, int maxnsegs, const char *name) { int vq_size, allocsize1, allocsize2, allocsize3, allocsize = 0; int rsegs, r, hdrlen; #define VIRTQUEUE_ALIGN(n) (((n)+(VIRTIO_PAGE_SIZE-1))& \ ~(VIRTIO_PAGE_SIZE-1)) memset(vq, 0, sizeof(*vq)); vq_size = virtio_read_queue_size(sc, index); if (vq_size == 0) { printf("virtqueue does not exist, index %d for %s\n", index, name); goto err; } if (((vq_size - 1) & vq_size) != 0) panic("vq_size not power of two: %d", vq_size); hdrlen = virtio_has_feature(sc, VIRTIO_F_RING_EVENT_IDX) ? 3 : 2; /* allocsize1: descriptor table + avail ring + pad */ allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size + sizeof(uint16_t) * (hdrlen + vq_size)); /* allocsize2: used ring + pad */ allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * hdrlen + sizeof(struct vring_used_elem) * vq_size); /* allocsize3: indirect table */ if (sc->sc_indirect && maxnsegs > 1) allocsize3 = sizeof(struct vring_desc) * maxnsegs * vq_size; else allocsize3 = 0; allocsize = allocsize1 + allocsize2 + allocsize3; /* alloc and map the memory */ r = bus_dmamem_alloc(sc->sc_dmat, allocsize, VIRTIO_PAGE_SIZE, 0, &vq->vq_segs[0], 1, &rsegs, BUS_DMA_NOWAIT); if (r != 0) { printf("virtqueue %d for %s allocation failed, error %d\n", index, name, r); goto err; } r = bus_dmamem_map(sc->sc_dmat, &vq->vq_segs[0], 1, allocsize, (caddr_t*)&vq->vq_vaddr, BUS_DMA_NOWAIT); if (r != 0) { printf("virtqueue %d for %s map failed, error %d\n", index, name, r); goto err; } r = bus_dmamap_create(sc->sc_dmat, allocsize, 1, allocsize, 0, BUS_DMA_NOWAIT, &vq->vq_dmamap); if (r != 0) { printf("virtqueue %d for %s dmamap creation failed, " "error %d\n", index, name, r); goto err; } r = bus_dmamap_load(sc->sc_dmat, vq->vq_dmamap, vq->vq_vaddr, allocsize, NULL, BUS_DMA_NOWAIT); if (r != 0) { printf("virtqueue %d for %s dmamap load failed, error %d\n", index, name, r); goto err; } /* remember addresses and offsets for later use */ vq->vq_owner = sc; vq->vq_num = vq_size; vq->vq_mask = vq_size - 1; vq->vq_index = index; vq->vq_desc = vq->vq_vaddr; vq->vq_availoffset = sizeof(struct vring_desc)*vq_size; vq->vq_avail = (struct vring_avail*)(((char*)vq->vq_desc) + vq->vq_availoffset); vq->vq_usedoffset = allocsize1; vq->vq_used = (struct vring_used*)(((char*)vq->vq_desc) + vq->vq_usedoffset); if (allocsize3 > 0) { vq->vq_indirectoffset = allocsize1 + allocsize2; vq->vq_indirect =
(void*)(((char*)vq->vq_desc) + vq->vq_indirectoffset); } vq->vq_bytesize = allocsize; vq->vq_maxnsegs = maxnsegs; /* free slot management */ vq->vq_entries = mallocarray(vq_size, sizeof(struct vq_entry), M_DEVBUF, M_NOWAIT | M_ZERO); if (vq->vq_entries == NULL) { r = ENOMEM; goto err; } virtio_init_vq(sc, vq); virtio_setup_queue(sc, vq, vq->vq_dmamap->dm_segs[0].ds_addr); #if VIRTIO_DEBUG printf("\nallocated %u bytes for virtqueue %d for %s, size %d\n", allocsize, index, name, vq_size); if (allocsize3 > 0) printf("using %d bytes (%d entries) indirect descriptors\n", allocsize3, maxnsegs * vq_size); #endif return 0; err: if (vq->vq_dmamap) bus_dmamap_destroy(sc->sc_dmat, vq->vq_dmamap); if (vq->vq_vaddr) bus_dmamem_unmap(sc->sc_dmat, vq->vq_vaddr, allocsize); if (vq->vq_segs[0].ds_addr) bus_dmamem_free(sc->sc_dmat, &vq->vq_segs[0], 1); memset(vq, 0, sizeof(*vq)); return -1; } int virtio_free_vq(struct virtio_softc *sc, struct virtqueue *vq) { struct vq_entry *qe; int i = 0; /* device must be already deactivated */ /* confirm the vq is empty */ SLIST_FOREACH(qe, &vq->vq_freelist, qe_list) { i++; } if (i != vq->vq_num) { printf("%s: freeing non-empty vq, index %d\n", sc->sc_dev.dv_xname, vq->vq_index); return EBUSY; } /* tell device that there's no virtqueue any longer */ virtio_setup_queue(sc, vq, 0); free(vq->vq_entries, M_DEVBUF, 0); bus_dmamap_unload(sc->sc_dmat, vq->vq_dmamap); bus_dmamap_destroy(sc->sc_dmat, vq->vq_dmamap); bus_dmamem_unmap(sc->sc_dmat, vq->vq_vaddr, vq->vq_bytesize); bus_dmamem_free(sc->sc_dmat, &vq->vq_segs[0], 1); memset(vq, 0, sizeof(*vq)); return 0; } /* * Free descriptor management. */ struct vq_entry * vq_alloc_entry(struct virtqueue *vq) { struct vq_entry *qe; if (SLIST_EMPTY(&vq->vq_freelist)) return NULL; qe = SLIST_FIRST(&vq->vq_freelist); SLIST_REMOVE_HEAD(&vq->vq_freelist, qe_list); return qe; } void vq_free_entry(struct virtqueue *vq, struct vq_entry *qe) { SLIST_INSERT_HEAD(&vq->vq_freelist, qe, qe_list); } /* * Enqueue several dmamaps as a single request. */ /* * Typical usage: * <queue size> instances of the following are stored in arrays * - command blocks (in dmamem) should be pre-allocated and mapped * - dmamaps for command blocks should be pre-allocated and loaded * - dmamaps for payload should be pre-allocated * r = virtio_enqueue_prep(sc, vq, &slot); // allocate a slot * if (r) // currently 0 or EAGAIN * return r; * r = bus_dmamap_load(dmat, dmamap_payload[slot], data, count, ..); * if (r) { * virtio_enqueue_abort(sc, vq, slot); * bus_dmamap_unload(dmat, dmamap_payload[slot]); * return r; * } * r = virtio_enqueue_reserve(sc, vq, slot, * dmamap_payload[slot]->dm_nsegs+1); * // ^ +1 for command * if (r) { // currently 0 or EAGAIN * bus_dmamap_unload(dmat, dmamap_payload[slot]); * return r; // do not call abort() * } * <setup and prepare commands> * bus_dmamap_sync(dmat, dmamap_cmd[slot],... BUS_DMASYNC_PREWRITE); * bus_dmamap_sync(dmat, dmamap_payload[slot],...); * virtio_enqueue(sc, vq, slot, dmamap_cmd[slot], 0); * virtio_enqueue(sc, vq, slot, dmamap_payload[slot], iswrite); * virtio_enqueue_commit(sc, vq, slot, 1); * * Alternative usage with statically allocated slots: * <during initialization> * // while not out of slots, do * virtio_enqueue_prep(sc, vq, &slot); // allocate a slot * virtio_enqueue_reserve(sc, vq, slot, max_segs); // reserve all slots * that may ever be needed * * <when enqueuing a request> * // Don't call virtio_enqueue_prep() * bus_dmamap_load(dmat, dmamap_payload[slot], data, count, ..); * bus_dmamap_sync(dmat, dmamap_cmd[slot],...
BUS_DMASYNC_PREWRITE); * bus_dmamap_sync(dmat, dmamap_payload[slot],...); * virtio_enqueue_trim(sc, vq, slot, num_segs_needed); * virtio_enqueue(sc, vq, slot, dmamap_cmd[slot], 0); * virtio_enqueue(sc, vq, slot, dmamap_payload[slot], iswrite); * virtio_enqueue_commit(sc, vq, slot, 1); * * <when dequeuing> * // don't call virtio_dequeue_commit() */ /* * enqueue_prep: allocate a slot number */ int virtio_enqueue_prep(struct virtqueue *vq, int *slotp) { struct vq_entry *qe1; VIRTIO_ASSERT(slotp != NULL); qe1 = vq_alloc_entry(vq); if (qe1 == NULL) return EAGAIN; /* next slot is not allocated yet */ qe1->qe_next = -1; *slotp = qe1->qe_index; return 0; } /* * enqueue_reserve: allocate remaining slots and build the descriptor chain. * Calls virtio_enqueue_abort() on failure. */ int virtio_enqueue_reserve(struct virtqueue *vq, int slot, int nsegs) { struct vq_entry *qe1 = &vq->vq_entries[slot]; VIRTIO_ASSERT(qe1->qe_next == -1); VIRTIO_ASSERT(1 <= nsegs && nsegs <= vq->vq_num); if (vq->vq_indirect != NULL && nsegs > 1 && nsegs <= vq->vq_maxnsegs) { struct vring_desc *vd; int i; qe1->qe_indirect = 1; vd = &vq->vq_desc[qe1->qe_index]; vd->addr = vq->vq_dmamap->dm_segs[0].ds_addr + vq->vq_indirectoffset; vd->addr += sizeof(struct vring_desc) * vq->vq_maxnsegs * qe1->qe_index; vd->len = sizeof(struct vring_desc) * nsegs; vd->flags = VRING_DESC_F_INDIRECT; vd = vq->vq_indirect; vd += vq->vq_maxnsegs * qe1->qe_index; qe1->qe_desc_base = vd; for (i = 0; i < nsegs-1; i++) vd[i].flags = VRING_DESC_F_NEXT; vd[i].flags = 0; qe1->qe_next = 0; return 0; } else { struct vring_desc *vd; struct vq_entry *qe; int i, s; qe1->qe_indirect = 0; vd = &vq->vq_desc[0]; qe1->qe_desc_base = vd; qe1->qe_next = qe1->qe_index; s = slot; for (i = 0; i < nsegs - 1; i++) { qe = vq_alloc_entry(vq); if (qe == NULL) { vd[s].flags = 0; virtio_enqueue_abort(vq, slot); return EAGAIN; } vd[s].flags = VRING_DESC_F_NEXT; vd[s].next = qe->qe_index; s = qe->qe_index; } vd[s].flags = 0; return 0; } } /* * enqueue: enqueue a single dmamap. 
*/ int virtio_enqueue(struct virtqueue *vq, int slot, bus_dmamap_t dmamap, int write) { struct vq_entry *qe1 = &vq->vq_entries[slot]; struct vring_desc *vd = qe1->qe_desc_base; int i; int s = qe1->qe_next; VIRTIO_ASSERT(s >= 0); VIRTIO_ASSERT(dmamap->dm_nsegs > 0); if (dmamap->dm_nsegs > vq->vq_maxnsegs) { #if VIRTIO_DEBUG for (i = 0; i < dmamap->dm_nsegs; i++) { printf(" %d (%d): %p %lx \n", i, write, (void *)dmamap->dm_segs[i].ds_addr, dmamap->dm_segs[i].ds_len); } #endif panic("dmamap->dm_nseg %d > vq->vq_maxnsegs %d\n", dmamap->dm_nsegs, vq->vq_maxnsegs); } for (i = 0; i < dmamap->dm_nsegs; i++) { vd[s].addr = dmamap->dm_segs[i].ds_addr; vd[s].len = dmamap->dm_segs[i].ds_len; if (!write) vd[s].flags |= VRING_DESC_F_WRITE; s = vd[s].next; } qe1->qe_next = s; return 0; } int virtio_enqueue_p(struct virtqueue *vq, int slot, bus_dmamap_t dmamap, bus_addr_t start, bus_size_t len, int write) { struct vq_entry *qe1 = &vq->vq_entries[slot]; struct vring_desc *vd = qe1->qe_desc_base; int s = qe1->qe_next; VIRTIO_ASSERT(s >= 0); /* XXX todo: handle more segments */ VIRTIO_ASSERT(dmamap->dm_nsegs == 1); VIRTIO_ASSERT((dmamap->dm_segs[0].ds_len > start) && (dmamap->dm_segs[0].ds_len >= start + len)); vd[s].addr = dmamap->dm_segs[0].ds_addr + start; vd[s].len = len; if (!write) vd[s].flags |= VRING_DESC_F_WRITE; qe1->qe_next = vd[s].next; return 0; } static void publish_avail_idx(struct virtio_softc *sc, struct virtqueue *vq) { vq_sync_aring(sc, vq, BUS_DMASYNC_PREWRITE); virtio_membar_producer(); vq->vq_avail->idx = vq->vq_avail_idx; vq_sync_aring(sc, vq, BUS_DMASYNC_POSTWRITE); vq->vq_queued = 1; } /* * enqueue_commit: add it to the aring. */ void virtio_enqueue_commit(struct virtio_softc *sc, struct virtqueue *vq, int slot, int notifynow) { struct vq_entry *qe1; if (slot < 0) goto notify; vq_sync_descs(sc, vq, BUS_DMASYNC_PREWRITE); qe1 = &vq->vq_entries[slot]; if (qe1->qe_indirect) vq_sync_indirect(sc, vq, slot, BUS_DMASYNC_PREWRITE); vq->vq_avail->ring[(vq->vq_avail_idx++) & vq->vq_mask] = slot; notify: if (notifynow) { if (virtio_has_feature(vq->vq_owner, VIRTIO_F_RING_EVENT_IDX)) { uint16_t o = vq->vq_avail->idx; uint16_t n = vq->vq_avail_idx; uint16_t t; publish_avail_idx(sc, vq); virtio_membar_sync(); t = VQ_AVAIL_EVENT(vq) + 1; if ((uint16_t)(n - t) < (uint16_t)(n - o)) sc->sc_ops->kick(sc, vq->vq_index); } else { publish_avail_idx(sc, vq); virtio_membar_sync(); if (!(vq->vq_used->flags & VRING_USED_F_NO_NOTIFY)) sc->sc_ops->kick(sc, vq->vq_index); } } } /* * enqueue_abort: rollback. */ int virtio_enqueue_abort(struct virtqueue *vq, int slot) { struct vq_entry *qe = &vq->vq_entries[slot]; struct vring_desc *vd; int s; if (qe->qe_next < 0) { vq_free_entry(vq, qe); return 0; } s = slot; vd = &vq->vq_desc[0]; while (vd[s].flags & VRING_DESC_F_NEXT) { s = vd[s].next; vq_free_entry(vq, qe); qe = &vq->vq_entries[s]; } vq_free_entry(vq, qe); return 0; } /* * enqueue_trim: adjust buffer size to given # of segments, a.k.a. * descriptors. */ void virtio_enqueue_trim(struct virtqueue *vq, int slot, int nsegs) { struct vq_entry *qe1 = &vq->vq_entries[slot]; struct vring_desc *vd = &vq->vq_desc[0]; int i; if ((vd[slot].flags & VRING_DESC_F_INDIRECT) == 0) { qe1->qe_next = qe1->qe_index; /* * N.B.: the vq_entries are ASSUMED to be a contiguous * block with slot being the index to the first one. 
*/ } else { qe1->qe_next = 0; vd = &vq->vq_desc[qe1->qe_index]; vd->len = sizeof(struct vring_desc) * nsegs; vd = qe1->qe_desc_base; slot = 0; } for (i = 0; i < nsegs - 1; i++) { vd[slot].flags = VRING_DESC_F_NEXT; slot++; } vd[slot].flags = 0; } /* * Dequeue a request. */ /* * dequeue: dequeue a request from uring; dmamap_sync for uring is * already done in the interrupt handler. */ int virtio_dequeue(struct virtio_softc *sc, struct virtqueue *vq, int *slotp, int *lenp) { uint16_t slot, usedidx; struct vq_entry *qe; if (vq->vq_used_idx == vq->vq_used->idx) return ENOENT; usedidx = vq->vq_used_idx++; usedidx &= vq->vq_mask; virtio_membar_consumer(); slot = vq->vq_used->ring[usedidx].id; qe = &vq->vq_entries[slot]; if (qe->qe_indirect) vq_sync_indirect(sc, vq, slot, BUS_DMASYNC_POSTWRITE); if (slotp) *slotp = slot; if (lenp) *lenp = vq->vq_used->ring[usedidx].len; return 0; } /* * dequeue_commit: complete dequeue; the slot is recycled for future use. * if you forget to call this the slot will be leaked. * * Don't call this if you use statically allocated slots * and virtio_dequeue_trim(). */ int virtio_dequeue_commit(struct virtqueue *vq, int slot) { struct vq_entry *qe = &vq->vq_entries[slot]; struct vring_desc *vd = &vq->vq_desc[0]; int s = slot; while (vd[s].flags & VRING_DESC_F_NEXT) { s = vd[s].next; vq_free_entry(vq, qe); qe = &vq->vq_entries[s]; } vq_free_entry(vq, qe); return 0; } /* * Increase the event index in order to delay interrupts. * Returns 0 on success; returns 1 if the used ring has already advanced * too far, and the caller must process the queue again (otherwise, no * more interrupts will happen). */ int virtio_postpone_intr(struct virtqueue *vq, uint16_t nslots) { uint16_t idx; idx = vq->vq_used_idx + nslots; /* set the new event index: avail_ring->used_event = idx */ VQ_USED_EVENT(vq) = idx; virtio_membar_sync(); vq_sync_aring(vq->vq_owner, vq, BUS_DMASYNC_PREWRITE); vq->vq_queued++; if (nslots < virtio_nused(vq)) return 1; return 0; } /* * Postpone interrupt until 3/4 of the available descriptors have been * consumed. */ int virtio_postpone_intr_smart(struct virtqueue *vq) { uint16_t nslots; nslots = (uint16_t)(vq->vq_avail->idx - vq->vq_used_idx) * 3 / 4; return virtio_postpone_intr(vq, nslots); } /* * Postpone interrupt until all of the available descriptors have been * consumed. */ int virtio_postpone_intr_far(struct virtqueue *vq) { uint16_t nslots; nslots = (uint16_t)(vq->vq_avail->idx - vq->vq_used_idx); return virtio_postpone_intr(vq, nslots); } /* * Start/stop vq interrupt. No guarantee. */ void virtio_stop_vq_intr(struct virtio_softc *sc, struct virtqueue *vq) { if (virtio_has_feature(sc, VIRTIO_F_RING_EVENT_IDX)) { /* * No way to disable the interrupt completely with * RingEventIdx. Instead advance used_event by half * the possible value. This won't happen soon and * is far enough in the past to not trigger a spurious * interrupt.
*/ VQ_USED_EVENT(vq) = vq->vq_used_idx + 0x8000; } else { vq->vq_avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; } vq_sync_aring(sc, vq, BUS_DMASYNC_PREWRITE); vq->vq_queued++; } int virtio_start_vq_intr(struct virtio_softc *sc, struct virtqueue *vq) { /* * If event index feature is negotiated, enabling * interrupts is done through setting the latest * consumed index in the used_event field */ if (virtio_has_feature(sc, VIRTIO_F_RING_EVENT_IDX)) VQ_USED_EVENT(vq) = vq->vq_used_idx; else vq->vq_avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; virtio_membar_sync(); vq_sync_aring(sc, vq, BUS_DMASYNC_PREWRITE); vq->vq_queued++; if (vq->vq_used_idx != vq->vq_used->idx) return 1; return 0; } /* * Returns a number of slots in the used ring available to * be supplied to the avail ring. */ int virtio_nused(struct virtqueue *vq) { uint16_t n; n = (uint16_t)(vq->vq_used->idx - vq->vq_used_idx); VIRTIO_ASSERT(n <= vq->vq_num); return n; } #if VIRTIO_DEBUG void virtio_vq_dump(struct virtqueue *vq) { /* Common fields */ printf(" + vq num: %d\n", vq->vq_num); printf(" + vq mask: 0x%X\n", vq->vq_mask); printf(" + vq index: %d\n", vq->vq_index); printf(" + vq used idx: %d\n", vq->vq_used_idx); printf(" + vq avail idx: %d\n", vq->vq_avail_idx); printf(" + vq queued: %d\n",vq->vq_queued); /* Avail ring fields */ printf(" + avail flags: 0x%X\n", vq->vq_avail->flags); printf(" + avail idx: %d\n", vq->vq_avail->idx); printf(" + avail event: %d\n", VQ_AVAIL_EVENT(vq)); /* Used ring fields */ printf(" + used flags: 0x%X\n",vq->vq_used->flags); printf(" + used idx: %d\n",vq->vq_used->idx); printf(" + used event: %d\n", VQ_USED_EVENT(vq)); printf(" +++++++++++++++++++++++++++\n"); } #endif
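/*
 * A hedged sketch of the dynamic-slot submission flow documented in the
 * "Typical usage" comment further up in this file. Note that comment
 * still shows an sc argument to virtio_enqueue_prep()/_reserve()/
 * _enqueue() which the definitions above no longer take. sc, vq and the
 * pre-created cmd_map[]/payload_map[] dmamaps are assumed to exist as
 * that comment describes; example_submit itself is a made-up name.
 */
static int
example_submit(struct virtio_softc *sc, struct virtqueue *vq,
    bus_dmamap_t *cmd_map, bus_dmamap_t *payload_map,
    void *data, size_t len, int iswrite)
{
	int slot, r;

	r = virtio_enqueue_prep(vq, &slot);		/* 0 or EAGAIN */
	if (r)
		return r;
	r = bus_dmamap_load(sc->sc_dmat, payload_map[slot], data, len,
	    NULL, BUS_DMA_NOWAIT);
	if (r) {
		virtio_enqueue_abort(vq, slot);
		return r;
	}
	/* one extra descriptor for the command block */
	r = virtio_enqueue_reserve(vq, slot, payload_map[slot]->dm_nsegs + 1);
	if (r) {	/* reserve aborts the slot itself on failure */
		bus_dmamap_unload(sc->sc_dmat, payload_map[slot]);
		return r;
	}
	bus_dmamap_sync(sc->sc_dmat, cmd_map[slot], 0,
	    cmd_map[slot]->dm_mapsize, BUS_DMASYNC_PREWRITE);
	bus_dmamap_sync(sc->sc_dmat, payload_map[slot], 0,
	    payload_map[slot]->dm_mapsize,
	    iswrite ? BUS_DMASYNC_PREWRITE : BUS_DMASYNC_PREREAD);
	virtio_enqueue(vq, slot, cmd_map[slot], 0);
	virtio_enqueue(vq, slot, payload_map[slot], iswrite);
	virtio_enqueue_commit(sc, vq, slot, 1);		/* publish + notify */
	return 0;
}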
/* $OpenBSD: ip6_output.c,v 1.256 2021/03/10 10:21:49 jsg Exp $ */ /* $KAME: ip6_output.c,v 1.172 2001/03/25 09:55:56 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include "pf.h" #include <sys/param.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/errno.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/proc.h> #include <sys/systm.h> #include <net/if.h> #include <net/if_var.h> #include <net/if_enc.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/udp.h> #include <netinet/tcp.h> #include <netinet/ip_var.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/udp_var.h> #include <netinet6/in6_var.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> #include <netinet6/ip6_var.h> #include <netinet6/nd6.h> #include <netinet6/ip6protosw.h> #include <crypto/idgen.h> #if NPF > 0 #include <net/pfvar.h> #endif #ifdef IPSEC #include <netinet/ip_ipsp.h> #include <netinet/ip_ah.h> #include <netinet/ip_esp.h> #ifdef ENCDEBUG #define DPRINTF(x) do { if (encdebug) printf x ; } while (0) #else #define DPRINTF(x) #endif #endif /* IPSEC */ struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, int, int); int ip6_getpcbopt(struct ip6_pktopts *, int, struct mbuf *); int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, int, int, int); int ip6_setmoptions(int, struct ip6_moptions **, struct mbuf *, unsigned int); int ip6_getmoptions(int, struct ip6_moptions *, struct mbuf *); int ip6_copyexthdr(struct mbuf **, caddr_t, int); int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); int ip6_getpmtu(struct rtentry *, struct ifnet *, u_long *); int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *); static __inline u_int16_t __attribute__((__unused__)) in6_cksum_phdr(const struct in6_addr *, const struct in6_addr *, u_int32_t, u_int32_t); void in6_delayed_cksum(struct mbuf *, u_int8_t); /* Context for non-repeating IDs */ struct idgen32_ctx ip6_id_ctx; /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one, * which is rt_mtu. 
*/ int ip6_output(struct mbuf *m, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, struct inpcb *inp) { struct ip6_hdr *ip6; struct ifnet *ifp = NULL; struct mbuf_list fml; int hlen, tlen; struct route_in6 ip6route; struct rtentry *rt = NULL; struct sockaddr_in6 *dst, dstsock; int error = 0; u_long mtu; int dontfrag; u_int16_t src_scope, dst_scope; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr finaldst; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; u_int8_t sproto = 0; u_char nextproto; #ifdef IPSEC struct tdb *tdb = NULL; #endif /* IPSEC */ #ifdef IPSEC if (inp && (inp->inp_flags & INP_IPV6) == 0) panic("%s: IPv4 pcb is passed", __func__); #endif /* IPSEC */ ip6 = mtod(m, struct ip6_hdr *); finaldst = ip6->ip6_dst; #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ } \ } while (0) bzero(&exthdrs, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } #ifdef IPSEC if (ipsec_in_use || inp) { tdb = ip6_output_ipsec_lookup(m, &error, inp); if (error != 0) { /* * -EINVAL is used to indicate that the packet should * be silently dropped, typically because we've asked * key management for an SA. */ if (error == -EINVAL) /* Should silently drop packet */ error = 0; goto freehdrs; } } #endif /* IPSEC */ /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here. do that later. */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; /* * If we need IPsec, or there is at least one extension header, * separate IP6 header from the payload. */ if ((sproto || optlen) && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ { u_char *nexthdrp = &ip6->ip6_nxt; struct mbuf *mprev = m; /* * we treat dest2 specially. 
this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. */ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("%s: assumption failed: hdr not split", __func__); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("assumption failed: hdr not split"); \ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (0) /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); } /* * If there is a routing header, replace the destination address field * with the first hop of the routing header. */ if (exthdrs.ip6e_rthdr) { struct ip6_rthdr *rh; struct ip6_rthdr0 *rh0; struct in6_addr *addr; rh = (struct ip6_rthdr *)(mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *)); switch (rh->ip6r_type) { case IPV6_RTHDR_TYPE_0: rh0 = (struct ip6_rthdr0 *)rh; addr = (struct in6_addr *)(rh0 + 1); ip6->ip6_dst = addr[0]; bcopy(&addr[1], &addr[0], sizeof(struct in6_addr) * (rh0->ip6r0_segleft - 1)); addr[rh0->ip6r0_segleft - 1] = finaldst; break; default: /* is it possible? */ error = EINVAL; goto bad; } } /* Source address validation */ if (!(flags & IPV6_UNSPECSRC) && IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* * XXX: we can probably assume validation in the caller, but * we explicitly check the address here for safety. */ error = EOPNOTSUPP; ip6stat_inc(ip6s_badscope); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; ip6stat_inc(ip6s_badscope); goto bad; } ip6stat_inc(ip6s_localout); /* * Route packet. */ #if NPF > 0 reroute: #endif /* initialize cached route */ if (ro == NULL) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); } ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = &ro->ro_dst; /* * if specified, try to fill in the traffic class field. * do not override if a non-zero value is already set. * we check the diffserv field and the ecn field separately. */ if (opt && opt->ip6po_tclass >= 0) { int mask = 0; if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) mask |= 0xfc; if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* fill in or override the hop limit field, if necessary. */ if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_hlim; else ip6->ip6_hlim = ip6_defmcasthlim; } #ifdef IPSEC if (tdb) { /* * XXX what should we do if ip6_hlim == 0 and the * packet gets tunneled? */ /* * if we are source-routing, do not attempt to tunnel the * packet just because ip6_dst is different from what tdb has. * XXX */ error = ip6_output_ipsec_send(tdb, m, ro, exthdrs.ip6e_rthdr ? 
1 : 0, 0); goto done; } #endif /* IPSEC */ bzero(&dstsock, sizeof(dstsock)); dstsock.sin6_family = AF_INET6; dstsock.sin6_addr = ip6->ip6_dst; dstsock.sin6_len = sizeof(dstsock); ro->ro_tableid = m->m_pkthdr.ph_rtableid; if (IN6_IS_ADDR_MULTICAST(&dstsock.sin6_addr)) { struct in6_pktinfo *pi = NULL; /* * If the caller specifies the outgoing interface * explicitly, use it. */ if (opt != NULL && (pi = opt->ip6po_pktinfo) != NULL) ifp = if_get(pi->ipi6_ifindex); if (ifp == NULL && im6o != NULL) ifp = if_get(im6o->im6o_ifidx); } if (ifp == NULL) { rt = in6_selectroute(&dstsock, opt, ro, ro->ro_tableid); if (rt == NULL) { ip6stat_inc(ip6s_noroute); error = EHOSTUNREACH; goto bad; } if (ISSET(rt->rt_flags, RTF_LOCAL)) ifp = if_get(rtable_loindex(m->m_pkthdr.ph_rtableid)); else ifp = if_get(rt->rt_ifidx); /* * We aren't using rtisvalid() here because the UP/DOWN state * machine is broken with some Ethernet drivers like em(4). * As a result we might try to use an invalid cached route * entry while an interface is being detached. */ if (ifp == NULL) { ip6stat_inc(ip6s_noroute); error = EHOSTUNREACH; goto bad; } } else { *dst = dstsock; } if (rt && (rt->rt_flags & RTF_GATEWAY) && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) dst = satosin6(rt->rt_gateway); if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* Unicast */ m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ } else { /* Multicast */ m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; /* * Confirm that the outgoing interface supports multicast. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { ip6stat_inc(ip6s_noroute); error = ENETUNREACH; goto bad; } if ((im6o == NULL || im6o->im6o_loop) && in6_hasmulti(&ip6->ip6_dst, ifp)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. * Can't defer TCP/UDP checksumming, do the * computation now. */ in6_proto_cksum_out(m, NULL); ip6_mloopback(ifp, m, dst); } #ifdef MROUTING else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (ip6_mforwarding && ip6_mrouter[ifp->if_rdomain] && (flags & IPV6_FORWARDING) == 0) { if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } #endif /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * If this packet is going through a loopback interface we won't * be able to restore its scope ID using the interface index. */ if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) { if (ifp->if_flags & IFF_LOOPBACK) src_scope = ip6->ip6_src.s6_addr16[1]; ip6->ip6_src.s6_addr16[1] = 0; } if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) { if (ifp->if_flags & IFF_LOOPBACK) dst_scope = ip6->ip6_dst.s6_addr16[1]; ip6->ip6_dst.s6_addr16[1] = 0; } /* Determine path MTU.
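* ip6_getpmtu() below raises route MTUs smaller than IPV6_MMTU (1280, the * IPv6 minimum link MTU) back up to IPV6_MMTU, and clamps route MTUs larger * than the interface MTU down to the interface MTU.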
*/ if ((error = ip6_getpmtu(ro_pmtu->ro_rt, ifp, &mtu)) != 0) goto bad; /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details. */ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t rtalert; /* returned value is ignored */ u_int32_t plen = 0; /* no more than 1 jumbo payload option! */ m->m_pkthdr.ph_ifidx = ifp->if_index; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &rtalert, &plen) < 0) { /* m was already freed at this point */ error = EINVAL; /* better error? */ goto done; } m->m_pkthdr.ph_ifidx = 0; } #if NPF > 0 if (pf_test(AF_INET6, PF_OUT, ifp, &m) != PF_PASS) { error = EACCES; m_freem(m); goto done; } if (m == NULL) goto done; ip6 = mtod(m, struct ip6_hdr *); if ((m->m_pkthdr.pf.flags & (PF_TAG_REROUTE | PF_TAG_GENERATED)) == (PF_TAG_REROUTE | PF_TAG_GENERATED)) { /* already rerun the route lookup, go on */ m->m_pkthdr.pf.flags &= ~(PF_TAG_GENERATED | PF_TAG_REROUTE); } else if (m->m_pkthdr.pf.flags & PF_TAG_REROUTE) { /* tag as generated to skip over pf_test on rerun */ m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; finaldst = ip6->ip6_dst; ro = NULL; if_put(ifp); /* drop reference since destination changed */ ifp = NULL; goto reroute; } #endif /* * If the packet is not going on the wire it can be destined * to any local address. In this case do not clear its scopes * to let ip6_input() find a matching local route. */ if (ifp->if_flags & IFF_LOOPBACK) { if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) ip6->ip6_src.s6_addr16[1] = src_scope; if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = dst_scope; } in6_proto_cksum_out(m, ifp); /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending.
* * the logic here is rather complex: * 1: normal case (dontfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu */ tlen = m->m_pkthdr.len; if (ISSET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT)) { CLR(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); dontfrag = 1; } else if (opt && ISSET(opt->ip6po_flags, IP6PO_DONTFRAG)) dontfrag = 1; else dontfrag = 0; if (dontfrag && tlen > ifp->if_mtu) { /* case 2-b */ #ifdef IPSEC if (ip_mtudisc) ipsec_adjust_mtu(m, mtu); #endif error = EMSGSIZE; goto bad; } /* * transmit packet without fragmentation */ if (dontfrag || (tlen <= mtu)) { /* case 1-a and 2-a */ error = ifp->if_output(ifp, m, sin6tosa(dst), ro->ro_rt); goto done; } /* * try to fragment the packet. case 1-b */ if (mtu < IPV6_MMTU) { /* path MTU cannot be less than IPV6_MMTU */ error = EMSGSIZE; goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ error = EMSGSIZE; goto bad; } /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ hlen = unfragpartlen; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } error = ip6_fragment(m, &fml, hlen, nextproto, mtu); if (error) goto done; while ((m = ml_dequeue(&fml)) != NULL) { error = ifp->if_output(ifp, m, sin6tosa(dst), ro->ro_rt); if (error) break; } if (error) ml_purge(&fml); else ip6stat_inc(ip6s_fragmented); done: if_put(ifp); if (ro == &ip6route && ro->ro_rt) { rtfree(ro->ro_rt); } else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) { rtfree(ro_pmtu->ro_rt); } return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: m_freem(m); goto done; } int ip6_fragment(struct mbuf *m0, struct mbuf_list *fml, int hlen, u_char nextproto, u_long mtu) { struct mbuf *m, *m_frgpart; struct ip6_hdr *mhip6; struct ip6_frag *ip6f; u_int32_t id; int tlen, len, off; int error; ml_init(fml); tlen = m0->m_pkthdr.len; len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; goto bad; } id = htonl(ip6_randomid()); /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. 
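* For example, with mtu 1280 and an unfragmentable part of hlen 40 (just * the IPv6 header), len = (1280 - 40 - 8) & ~7 = 1232, so every fragment * but the last carries 1232 bytes of payload at 8-byte-aligned offsets.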
*/ for (off = hlen; off < tlen; off += len) { struct mbuf *mlast; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) { error = ENOBUFS; goto bad; } ml_enqueue(fml, m); if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0) goto bad; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *mtod(m0, struct ip6_hdr *); m->m_len = sizeof(*mhip6); if ((error = ip6_insertfraghdr(m0, m, hlen, &ip6f)) != 0) goto bad; ip6f->ip6f_offlg = htons((u_int16_t)((off - hlen) & ~7)); if (off + len >= tlen) len = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_int16_t)(len + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copym(m0, off, len, M_DONTWAIT)) == NULL) { error = ENOBUFS; goto bad; } for (mlast = m; mlast->m_next; mlast = mlast->m_next) ; mlast->m_next = m_frgpart; m->m_pkthdr.len = len + hlen + sizeof(*ip6f); ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; } ip6stat_add(ip6s_ofragments, ml_len(fml)); m_freem(m0); return (0); bad: ip6stat_inc(ip6s_odropped); ml_purge(fml); m_freem(m0); return (error); } int ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen) { struct mbuf *m; if (hlen > MCLBYTES) return (ENOBUFS); /* XXX */ MGET(m, M_DONTWAIT, MT_DATA); if (!m) return (ENOBUFS); if (hlen > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return (ENOBUFS); } } m->m_len = hlen; if (hdr) memcpy(mtod(m, caddr_t), hdr, hlen); *mp = m; return (0); } /* * Insert jumbo payload option. */ int ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen) { struct mbuf *mopt; u_int8_t *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == 0) { MGET(mopt, M_DONTWAIT, MT_DATA); if (mopt == NULL) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_int8_t *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (m_trailingspace(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return (ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ MGET(n, M_DONTWAIT, MT_DATA); if (n) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; memcpy(mtod(n, caddr_t), mtod(mopt, caddr_t), oldoptlen); optbuf = mtod(n, u_int8_t *) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_int8_t *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 0; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. 
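* The resulting TLV is: type IP6OPT_JUMBO (0xc2), data length 4, then the * 32-bit jumbo payload length. plen is biased by JUMBOOPTLEN because this * option itself grows the payload by 8 bytes.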
*/ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); memcpy(&optbuf[4], &v, sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return (0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ int ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, struct ip6_frag **frghdrp) { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_DONTWAIT); if (n == NULL) return (ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if ((mlast->m_flags & M_EXT) == 0 && m_trailingspace(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; MGET(mfrg, M_DONTWAIT, MT_DATA); if (mfrg == NULL) return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return (0); } int ip6_getpmtu(struct rtentry *rt, struct ifnet *ifp, u_long *mtup) { u_int32_t mtu = 0; int error = 0; if (rt != NULL) { mtu = rt->rt_mtu; if (mtu == 0) mtu = ifp->if_mtu; else if (mtu < IPV6_MMTU) { /* RFC8021 IPv6 Atomic Fragments Considered Harmful */ mtu = IPV6_MMTU; } else if (mtu > ifp->if_mtu) { /* * The MTU on the route is larger than the MTU on * the interface! This shouldn't happen, unless the * MTU of the interface has been changed after the * interface was brought up. Change the MTU in the * route to match the interface MTU (as long as the * field isn't locked). */ mtu = ifp->if_mtu; if (!(rt->rt_locks & RTV_MTU)) rt->rt_mtu = mtu; } } else { mtu = ifp->if_mtu; } *mtup = mtu; return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(int op, struct socket *so, int level, int optname, struct mbuf *m) { int privileged, optdatalen, uproto; void *optdata; struct inpcb *inp = sotoinpcb(so); int error, optval; struct proc *p = curproc; /* For IPsec and rdomain */ u_int rtid = 0; error = optval = 0; privileged = (inp->inp_socket->so_state & SS_PRIV); uproto = (int)so->so_proto->pr_protocol; if (level != IPPROTO_IPV6) return (EINVAL); switch (op) { case PRCO_SETOPT: switch (optname) { /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. 
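* Hence IPV6_RECVHOPOPTS and IPV6_RECVDSTOPTS below fail with EPERM on * unprivileged sockets.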
*/ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: if (!privileged) { error = EPERM; break; } /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_MINHOPCOUNT: case IPV6_HOPLIMIT: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: case IPV6_RECVDSTPORT: if (m == NULL || m->m_len != sizeof(int)) { error = EINVAL; break; } optval = *mtod(m, int *); switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ inp->inp_hops = optval; } break; case IPV6_MINHOPCOUNT: if (optval < 0 || optval > 255) error = EINVAL; else inp->inp_ip6_minhlim = optval; break; #define OPTSET(bit) \ do { \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ } while (/*CONSTCOND*/ 0) #define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0) case IPV6_RECVPKTINFO: OPTSET(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; optp = &inp->inp_outputopts6; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, privileged, uproto); break; } case IPV6_RECVHOPLIMIT: OPTSET(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: OPTSET(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: OPTSET(IN6P_DSTOPTS); break; case IPV6_RECVRTHDR: OPTSET(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) */ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED( &inp->inp_laddr6)) { error = EINVAL; break; } /* No support for IPv4-mapped addresses. 
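* Consequently IPV6_V6ONLY can only be set to 1 here; attempts to clear * it fail with EINVAL.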
*/ if (!optval) error = EINVAL; else error = 0; break; case IPV6_RECVTCLASS: OPTSET(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; case IPV6_RECVDSTPORT: OPTSET(IN6P_RECVDSTPORT); break; } break; case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: if (m == NULL || m->m_len != sizeof(optval)) { error = EINVAL; break; } optval = *mtod(m, int *); { struct ip6_pktopts **optp; optp = &inp->inp_outputopts6; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, privileged, uproto); break; } case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { /* new advanced API (RFC3542) */ u_char *optbuf; int optbuflen; struct ip6_pktopts **optp; if (m && m->m_next) { error = EINVAL; /* XXX */ break; } if (m) { optbuf = mtod(m, u_char *); optbuflen = m->m_len; } else { optbuf = NULL; optbuflen = 0; } optp = &inp->inp_outputopts6; error = ip6_pcbopt(optname, optbuf, optbuflen, optp, privileged, uproto); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: error = ip6_setmoptions(optname, &inp->inp_moptions6, m, inp->inp_rtableid); break; case IPV6_PORTRANGE: if (m == NULL || m->m_len != sizeof(int)) { error = EINVAL; break; } optval = *mtod(m, int *); switch (optval) { case IPV6_PORTRANGE_DEFAULT: inp->inp_flags &= ~(IN6P_LOWPORT); inp->inp_flags &= ~(IN6P_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: inp->inp_flags &= ~(IN6P_LOWPORT); inp->inp_flags |= IN6P_HIGHPORT; break; case IPV6_PORTRANGE_LOW: inp->inp_flags &= ~(IN6P_HIGHPORT); inp->inp_flags |= IN6P_LOWPORT; break; default: error = EINVAL; break; } break; case IPSEC6_OUTSA: error = EINVAL; break; case IPV6_AUTH_LEVEL: case IPV6_ESP_TRANS_LEVEL: case IPV6_ESP_NETWORK_LEVEL: case IPV6_IPCOMP_LEVEL: #ifndef IPSEC error = EINVAL; #else if (m == NULL || m->m_len != sizeof(int)) { error = EINVAL; break; } optval = *mtod(m, int *); if (optval < IPSEC_LEVEL_BYPASS || optval > IPSEC_LEVEL_UNIQUE) { error = EINVAL; break; } switch (optname) { case IPV6_AUTH_LEVEL: if (optval < IPSEC_AUTH_LEVEL_DEFAULT && suser(p)) { error = EACCES; break; } inp->inp_seclevel[SL_AUTH] = optval; break; case IPV6_ESP_TRANS_LEVEL: if (optval < IPSEC_ESP_TRANS_LEVEL_DEFAULT && suser(p)) { error = EACCES; break; } inp->inp_seclevel[SL_ESP_TRANS] = optval; break; case IPV6_ESP_NETWORK_LEVEL: if (optval < IPSEC_ESP_NETWORK_LEVEL_DEFAULT && suser(p)) { error = EACCES; break; } inp->inp_seclevel[SL_ESP_NETWORK] = optval; break; case IPV6_IPCOMP_LEVEL: if (optval < IPSEC_IPCOMP_LEVEL_DEFAULT && suser(p)) { error = EACCES; break; } inp->inp_seclevel[SL_IPCOMP] = optval; break; } #endif break; case SO_RTABLE: if (m == NULL || m->m_len < sizeof(u_int)) { error = EINVAL; break; } rtid = *mtod(m, u_int *); if (inp->inp_rtableid == rtid) break; /* needs privileges to switch when already set */ if (p->p_p->ps_rtableid != rtid && p->p_p->ps_rtableid != 0 && (error = suser(p)) != 0) break; /* table must exist */ if (!rtable_exists(rtid)) { error = EINVAL; break; } if (inp->inp_lport) { error = EBUSY; break; } inp->inp_rtableid = rtid; in_pcbrehash(inp); break; case IPV6_PIPEX: if (m != NULL && m->m_len == sizeof(int)) inp->inp_pipex = *mtod(m, int *); else error = EINVAL; break; default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_MINHOPCOUNT: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case 
IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: case IPV6_RECVDSTPORT: switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_UNICAST_HOPS: optval = inp->inp_hops; break; case IPV6_MINHOPCOUNT: optval = inp->inp_ip6_minhlim; break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_V6ONLY: optval = 1; break; case IPV6_PORTRANGE: { int flags; flags = inp->inp_flags; if (flags & IN6P_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & IN6P_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; case IPV6_RECVDSTPORT: optval = OPTBIT(IN6P_RECVDSTPORT); break; } if (error) break; m->m_len = sizeof(int); *mtod(m, int *) = optval; break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; struct ifnet *ifp; struct rtentry *rt; if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); rt = in_pcbrtentry(inp); if (!rtisvalid(rt)) return (EHOSTUNREACH); ifp = if_get(rt->rt_ifidx); if (ifp == NULL) return (EHOSTUNREACH); /* * XXX: we do not consider the case of source * routing, or optional information to specify * the outgoing interface. */ error = ip6_getpmtu(rt, ifp, &pmtu); if_put(ifp); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; bzero(&mtuinfo, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); if (optdatalen > MCLBYTES) return (EMSGSIZE); /* XXX */ if (optdatalen > MLEN) MCLGET(m, M_WAIT); m->m_len = optdatalen; bcopy(optdata, mtod(m, void *), optdatalen); break; } case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: error = ip6_getpcbopt(inp->inp_outputopts6, optname, m); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: error = ip6_getmoptions(optname, inp->inp_moptions6, m); break; case IPSEC6_OUTSA: error = EINVAL; break; case IPV6_AUTH_LEVEL: case IPV6_ESP_TRANS_LEVEL: case IPV6_ESP_NETWORK_LEVEL: case IPV6_IPCOMP_LEVEL: #ifndef IPSEC m->m_len = sizeof(int); *mtod(m, int *) = IPSEC_LEVEL_NONE; #else m->m_len = sizeof(int); switch (optname) { case IPV6_AUTH_LEVEL: optval = inp->inp_seclevel[SL_AUTH]; break; case IPV6_ESP_TRANS_LEVEL: optval = inp->inp_seclevel[SL_ESP_TRANS]; break; case IPV6_ESP_NETWORK_LEVEL: optval = inp->inp_seclevel[SL_ESP_NETWORK]; break; case IPV6_IPCOMP_LEVEL: optval = inp->inp_seclevel[SL_IPCOMP]; break; } *mtod(m, int *) = optval; #endif break; case SO_RTABLE: m->m_len = sizeof(u_int); *mtod(m, u_int *) = inp->inp_rtableid; break; case IPV6_PIPEX: m->m_len = sizeof(int); *mtod(m, int *) = inp->inp_pipex; break; default: error = ENOPROTOOPT; break; } break; } return (error); } int ip6_raw_ctloutput(int op, struct socket *so, int level, int optname, struct mbuf *m) { int error = 0, optval; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct inpcb *inp = sotoinpcb(so); if (level != IPPROTO_IPV6) return (EINVAL); switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6
sockets, no modification allowed for checksum * offset, permit "no change" values to help existing apps. * * RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." * The current behavior does not meet RFC3542. */ switch (op) { case PRCO_SETOPT: if (m == NULL || m->m_len != sizeof(int)) { error = EINVAL; break; } optval = *mtod(m, int *); if (optval < -1 || (optval > 0 && (optval % 2) != 0)) { /* * The API assumes non-negative even offset * values or -1 as a special value. */ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else inp->inp_cksum6 = optval; break; case PRCO_GETOPT: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else optval = inp->inp_cksum6; m->m_len = sizeof(int); *mtod(m, int *) = optval; break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void ip6_initpktopts(struct ip6_pktopts *opt) { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; } int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, int priv, int uproto) { struct ip6_pktopts *opt; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_WAITOK); ip6_initpktopts(*pktopt); } opt = *pktopt; return (ip6_setpktopt(optname, buf, len, opt, priv, 1, uproto)); } int ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct mbuf *m) { void *optdata = NULL; int optdatalen = 0; struct ip6_ext *ip6e; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; switch (optname) { case IPV6_PKTINFO: if (pktopt && pktopt->ip6po_pktinfo) optdata = (void *)pktopt->ip6po_pktinfo; else { /* XXX: we don't have to do this every time... 
*/ bzero(&null_pktinfo, sizeof(null_pktinfo)); optdata = (void *)&null_pktinfo; } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) optdata = (void *)&pktopt->ip6po_tclass; else optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: if (pktopt && pktopt->ip6po_hbh) { optdata = (void *)pktopt->ip6po_hbh; ip6e = (struct ip6_ext *)pktopt->ip6po_hbh; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDR: if (pktopt && pktopt->ip6po_rthdr) { optdata = (void *)pktopt->ip6po_rthdr; ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDRDSTOPTS: if (pktopt && pktopt->ip6po_dest1) { optdata = (void *)pktopt->ip6po_dest1; ip6e = (struct ip6_ext *)pktopt->ip6po_dest1; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_DSTOPTS: if (pktopt && pktopt->ip6po_dest2) { optdata = (void *)pktopt->ip6po_dest2; ip6e = (struct ip6_ext *)pktopt->ip6po_dest2; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_USE_MIN_MTU: if (pktopt) optdata = (void *)&pktopt->ip6po_minmtu; else optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("%s: unexpected option", __func__); #endif return (ENOPROTOOPT); } if (optdatalen > MCLBYTES) return (EMSGSIZE); /* XXX */ if (optdatalen > MLEN) MCLGET(m, M_WAIT); m->m_len = optdatalen; if (optdatalen) bcopy(optdata, mtod(m, void *), optdatalen); return (error); } void ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (optname == -1 || optname == IPV6_PKTINFO) { if (pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT, 0); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT, 0); pktopt->ip6po_hbh = NULL; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT, 0); pktopt->ip6po_dest1 = NULL; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT, 0); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { rtfree(pktopt->ip6po_route.ro_rt); pktopt->ip6po_route.ro_rt = NULL; } } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT, 0); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ size_t hlen;\ hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, M_NOWAIT);\ if (dst->type == NULL)\ goto bad;\ memcpy(dst->type, src->type, hlen);\ }\ } while (/*CONSTCOND*/ 0) int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src) { dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, M_NOWAIT); if (dst->ip6po_pktinfo == NULL) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached 
route */ return (0); bad: ip6_clearpktopts(dst, -1); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY void ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT, 0); } /* * Set the IP6 multicast options in response to user setsockopt(). */ int ip6_setmoptions(int optname, struct ip6_moptions **im6op, struct mbuf *m, unsigned int rtableid) { int error = 0; u_int loop, ifindex; struct ipv6_mreq *mreq; struct ifnet *ifp; struct ip6_moptions *im6o = *im6op; struct in6_multi_mship *imm; struct proc *p = curproc; /* XXX */ if (im6o == NULL) { /* * No multicast option buffer attached to the pcb; * allocate one and initialize to default values. */ im6o = malloc(sizeof(*im6o), M_IPMOPTS, M_WAITOK); if (im6o == NULL) return (ENOBUFS); *im6op = im6o; im6o->im6o_ifidx = 0; im6o->im6o_hlim = ip6_defmcasthlim; im6o->im6o_loop = IPV6_DEFAULT_MULTICAST_LOOP; LIST_INIT(&im6o->im6o_memberships); } switch (optname) { case IPV6_MULTICAST_IF: /* * Select the interface for outgoing multicast packets. */ if (m == NULL || m->m_len != sizeof(u_int)) { error = EINVAL; break; } memcpy(&ifindex, mtod(m, u_int *), sizeof(ifindex)); if (ifindex != 0) { ifp = if_get(ifindex); if (ifp == NULL) { error = ENXIO; /* XXX EINVAL? */ break; } if (ifp->if_rdomain != rtable_l2(rtableid) || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; if_put(ifp); break; } if_put(ifp); } im6o->im6o_ifidx = ifindex; break; case IPV6_MULTICAST_HOPS: { /* * Set the IP6 hoplimit for outgoing multicast packets. */ int optval; if (m == NULL || m->m_len != sizeof(int)) { error = EINVAL; break; } memcpy(&optval, mtod(m, u_int *), sizeof(optval)); if (optval < -1 || optval >= 256) error = EINVAL; else if (optval == -1) im6o->im6o_hlim = ip6_defmcasthlim; else im6o->im6o_hlim = optval; break; } case IPV6_MULTICAST_LOOP: /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ if (m == NULL || m->m_len != sizeof(u_int)) { error = EINVAL; break; } memcpy(&loop, mtod(m, u_int *), sizeof(loop)); if (loop > 1) { error = EINVAL; break; } im6o->im6o_loop = loop; break; case IPV6_JOIN_GROUP: /* * Add a multicast group membership. * Group must be a valid IP6 multicast address. */ if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { error = EINVAL; break; } mreq = mtod(m, struct ipv6_mreq *); if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { /* * We use the unspecified address to specify to accept * all multicast addresses. Only super user is allowed * to do this. */ if (suser(p)) { error = EACCES; break; } } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { error = EINVAL; break; } /* * If no interface was explicitly specified, choose an * appropriate one according to the given multicast address. */ if (mreq->ipv6mr_interface == 0) { struct rtentry *rt; struct sockaddr_in6 dst; memset(&dst, 0, sizeof(dst)); dst.sin6_len = sizeof(dst); dst.sin6_family = AF_INET6; dst.sin6_addr = mreq->ipv6mr_multiaddr; rt = rtalloc(sin6tosa(&dst), RT_RESOLVE, rtableid); if (rt == NULL) { error = EADDRNOTAVAIL; break; } ifp = if_get(rt->rt_ifidx); rtfree(rt); } else { /* * If the interface is specified, validate it. */ ifp = if_get(mreq->ipv6mr_interface); if (ifp == NULL) { error = ENXIO; /* XXX EINVAL? 
*/ break; } } /* * See if we found an interface, and confirm that it * supports multicast */ if (ifp == NULL || ifp->if_rdomain != rtable_l2(rtableid) || (ifp->if_flags & IFF_MULTICAST) == 0) { if_put(ifp); error = EADDRNOTAVAIL; break; } /* * Put interface index into the multicast address, * if the address has link/interface-local scope. */ if (IN6_IS_SCOPE_EMBED(&mreq->ipv6mr_multiaddr)) { mreq->ipv6mr_multiaddr.s6_addr16[1] = htons(ifp->if_index); } /* * See if the membership already exists. */ LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain) if (imm->i6mm_maddr->in6m_ifidx == ifp->if_index && IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, &mreq->ipv6mr_multiaddr)) break; if (imm != NULL) { if_put(ifp); error = EADDRINUSE; break; } /* * Everything looks good; add a new record to the multicast * address list for the given interface. */ imm = in6_joingroup(ifp, &mreq->ipv6mr_multiaddr, &error); if_put(ifp); if (!imm) break; LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); break; case IPV6_LEAVE_GROUP: /* * Drop a multicast group membership. * Group must be a valid IP6 multicast address. */ if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { error = EINVAL; break; } mreq = mtod(m, struct ipv6_mreq *); if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { if (suser(p)) { error = EACCES; break; } } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { error = EINVAL; break; } /* * Put interface index into the multicast address, * if the address has link-local scope. */ if (IN6_IS_ADDR_MC_LINKLOCAL(&mreq->ipv6mr_multiaddr)) { mreq->ipv6mr_multiaddr.s6_addr16[1] = htons(mreq->ipv6mr_interface); } /* * If an interface address was specified, get a pointer * to its ifnet structure. */ if (mreq->ipv6mr_interface == 0) ifp = NULL; else { ifp = if_get(mreq->ipv6mr_interface); if (ifp == NULL) { error = ENXIO; /* XXX EINVAL? */ break; } } /* * Find the membership in the membership list. */ LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain) { if ((ifp == NULL || imm->i6mm_maddr->in6m_ifidx == ifp->if_index) && IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, &mreq->ipv6mr_multiaddr)) break; } if_put(ifp); if (imm == NULL) { /* Unable to resolve interface */ error = EADDRNOTAVAIL; break; } /* * Give up the multicast address record to which the * membership points. */ LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); break; default: error = EOPNOTSUPP; break; } /* * If all options have default values, no need to keep the option * structure. */ if (im6o->im6o_ifidx == 0 && im6o->im6o_hlim == ip6_defmcasthlim && im6o->im6o_loop == IPV6_DEFAULT_MULTICAST_LOOP && LIST_EMPTY(&im6o->im6o_memberships)) { free(*im6op, M_IPMOPTS, sizeof(**im6op)); *im6op = NULL; } return (error); } /* * Return the IP6 multicast options in response to user getsockopt(). */ int ip6_getmoptions(int optname, struct ip6_moptions *im6o, struct mbuf *m) { u_int *hlim, *loop, *ifindex; switch (optname) { case IPV6_MULTICAST_IF: ifindex = mtod(m, u_int *); m->m_len = sizeof(u_int); if (im6o == NULL || im6o->im6o_ifidx == 0) *ifindex = 0; else *ifindex = im6o->im6o_ifidx; return (0); case IPV6_MULTICAST_HOPS: hlim = mtod(m, u_int *); m->m_len = sizeof(u_int); if (im6o == NULL) *hlim = ip6_defmcasthlim; else *hlim = im6o->im6o_hlim; return (0); case IPV6_MULTICAST_LOOP: loop = mtod(m, u_int *); m->m_len = sizeof(u_int); if (im6o == NULL) *loop = IPV6_DEFAULT_MULTICAST_LOOP; else *loop = im6o->im6o_loop; return (0); default: return (EOPNOTSUPP); } } /* * Discard the IP6 multicast options.
*/ void ip6_freemoptions(struct ip6_moptions *im6o) { struct in6_multi_mship *imm; if (im6o == NULL) return; while (!LIST_EMPTY(&im6o->im6o_memberships)) { imm = LIST_FIRST(&im6o->im6o_memberships); LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } free(im6o, M_IPMOPTS, sizeof(*im6o)); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, int priv, int uproto) { u_int clen; struct cmsghdr *cm = 0; caddr_t cmsgs; int error; if (control == NULL || opt == NULL) return (EINVAL); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return (EINVAL); clen = control->m_len; cmsgs = mtod(control, caddr_t); do { if (clen < CMSG_LEN(0)) return (EINVAL); cm = (struct cmsghdr *)cmsgs; if (cm->cmsg_len < CMSG_LEN(0) || cm->cmsg_len > clen || CMSG_ALIGN(cm->cmsg_len) > clen) return (EINVAL); if (cm->cmsg_level == IPPROTO_IPV6) { error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, priv, 0, uproto); if (error) return (error); } clen -= CMSG_ALIGN(cm->cmsg_len); cmsgs += CMSG_ALIGN(cm->cmsg_len); } while (clen); return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. */ int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, int priv, int sticky, int uproto) { int minmtupolicy; switch (optname) { case IPV6_PKTINFO: { struct ifnet *ifp = NULL; struct in6_pktinfo *pktinfo; if (len != sizeof(struct in6_pktinfo)) return (EINVAL); pktinfo = (struct in6_pktinfo *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by * doing a "regular" setsockopt with ipi6_addr being * in6addr_any and ipi6_ifindex being zero. * [RFC 3542, Section 6] */ if (opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 && IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ip6_clearpktopts(opt, optname); break; } if (uproto == IPPROTO_TCP && sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { return (EINVAL); } if (pktinfo->ipi6_ifindex) { ifp = if_get(pktinfo->ipi6_ifindex); if (ifp == NULL) return (ENXIO); if_put(ifp); } /* * We store the address anyway, and let in6_selectsrc() * validate the specified address. This is because ipi6_addr * may not have enough information about its scope zone, and * we may need additional information (such as outgoing * interface or the scope zone of a destination address) to * disambiguate the scope. * XXX: the delay of the validation may confuse the * application when it is used as a sticky option. */ if (opt->ip6po_pktinfo == NULL) { opt->ip6po_pktinfo = malloc(sizeof(*pktinfo), M_IP6OPT, M_NOWAIT); if (opt->ip6po_pktinfo == NULL) return (ENOBUFS); } bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); break; } case IPV6_HOPLIMIT: { int *hlimp; /* * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT * to simplify the ordering among hoplimit options. 
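* (RFC 3542 recommends IPV6_UNICAST_HOPS/IPV6_MULTICAST_HOPS for the * sticky case, so only the ancillary data form is accepted here.)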
*/ if (sticky) return (ENOPROTOOPT); if (len != sizeof(int)) return (EINVAL); hlimp = (int *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); opt->ip6po_hlim = *hlimp; break; } case IPV6_TCLASS: { int tclass; if (len != sizeof(int)) return (EINVAL); tclass = *(int *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); opt->ip6po_tclass = tclass; break; } case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; /* * XXX: We don't allow a non-privileged user to set ANY HbH * options, since per-option restriction has too much * overhead. */ if (!priv) return (EPERM); if (len == 0) { ip6_clearpktopts(opt, IPV6_HOPOPTS); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); hbh = (struct ip6_hbh *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_HOPOPTS); opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_hbh == NULL) return (ENOBUFS); memcpy(opt->ip6po_hbh, hbh, hbhlen); break; } case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { struct ip6_dest *dest, **newdest = NULL; int destlen; if (!priv) /* XXX: see the comment for IPV6_HOPOPTS */ return (EPERM); if (len == 0) { ip6_clearpktopts(opt, optname); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); dest = (struct ip6_dest *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); /* * Determine the position that the destination options header * should be inserted; before or after the routing header. */ switch (optname) { case IPV6_RTHDRDSTOPTS: newdest = &opt->ip6po_dest1; break; case IPV6_DSTOPTS: newdest = &opt->ip6po_dest2; break; } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, optname); *newdest = malloc(destlen, M_IP6OPT, M_NOWAIT); if (*newdest == NULL) return (ENOBUFS); memcpy(*newdest, dest, destlen); break; } case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (len == 0) { ip6_clearpktopts(opt, IPV6_RTHDR); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); rth = (struct ip6_rthdr *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: if (rth->ip6r_len == 0) /* must contain one addr */ return (EINVAL); if (rth->ip6r_len % 2) /* length must be even */ return (EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return (EINVAL); break; default: return (EINVAL); /* not supported */ } /* turn off the previous option */ ip6_clearpktopts(opt, IPV6_RTHDR); opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_rthdr == NULL) return (ENOBUFS); memcpy(opt->ip6po_rthdr, rth, rthlen); break; } case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); minmtupolicy = *(int *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { return (EINVAL); } opt->ip6po_minmtu = minmtupolicy; break; case IPV6_DONTFRAG: if (len != sizeof(int)) return (EINVAL); if (uproto == IPPROTO_TCP || *(int *)buf == 0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) 
*/ opt->ip6po_flags &= ~IP6PO_DONTFRAG; } else opt->ip6po_flags |= IP6PO_DONTFRAG; break; default: return (ENOPROTOOPT); } /* end of switch */ return (0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. */ void ip6_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in6 *dst) { struct mbuf *copym; struct ip6_hdr *ip6; /* * Duplicate the packet. */ copym = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if ((copym->m_flags & M_EXT) != 0 || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } #ifdef DIAGNOSTIC if (copym->m_len < sizeof(*ip6)) { m_freem(copym); return; } #endif ip6 = mtod(copym, struct ip6_hdr *); if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) ip6->ip6_src.s6_addr16[1] = 0; if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = 0; if_input_local(ifp, copym, dst->sin6_family); } /* * Chop IPv6 header off from the payload. */ int ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { MGET(mh, M_DONTWAIT, MT_HEADER); if (mh == NULL) { m_freem(m); return ENOBUFS; } M_MOVE_PKTHDR(mh, m); m_align(mh, sizeof(*ip6)); m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } u_int32_t ip6_randomid(void) { return idgen32(&ip6_id_ctx); } void ip6_randomid_init(void) { idgen32_init(&ip6_id_ctx); } /* * Compute significant parts of the IPv6 checksum pseudo-header * for use in a delayed TCP/UDP checksum calculation. */ static __inline u_int16_t __attribute__((__unused__)) in6_cksum_phdr(const struct in6_addr *src, const struct in6_addr *dst, u_int32_t len, u_int32_t nxt) { u_int32_t sum = 0; const u_int16_t *w; w = (const u_int16_t *) src; sum += w[0]; if (!IN6_IS_SCOPE_EMBED(src)) sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; w = (const u_int16_t *) dst; sum += w[0]; if (!IN6_IS_SCOPE_EMBED(dst)) sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; sum += (u_int16_t)(len >> 16) + (u_int16_t)(len /*& 0xffff*/); sum += (u_int16_t)(nxt >> 16) + (u_int16_t)(nxt /*& 0xffff*/); sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/); if (sum > 0xffff) sum -= 0xffff; return (sum); } /* * Process a delayed payload checksum calculation. */ void in6_delayed_cksum(struct mbuf *m, u_int8_t nxt) { int nxtp, offset; u_int16_t csum; offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxtp); if (offset <= 0 || nxtp != nxt) /* If the desired next protocol isn't found, punt. 
*/ return; csum = (u_int16_t)(in6_cksum(m, 0, offset, m->m_pkthdr.len - offset)); switch (nxt) { case IPPROTO_TCP: offset += offsetof(struct tcphdr, th_sum); break; case IPPROTO_UDP: offset += offsetof(struct udphdr, uh_sum); if (csum == 0) csum = 0xffff; break; case IPPROTO_ICMPV6: offset += offsetof(struct icmp6_hdr, icmp6_cksum); break; } if ((offset + sizeof(u_int16_t)) > m->m_len) m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); else *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum; } void in6_proto_cksum_out(struct mbuf *m, struct ifnet *ifp) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); /* some hw and in6_delayed_cksum need the pseudo header cksum */ if (m->m_pkthdr.csum_flags & (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_ICMP_CSUM_OUT)) { int nxt, offset; u_int16_t csum; offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); csum = in6_cksum_phdr(&ip6->ip6_src, &ip6->ip6_dst, htonl(m->m_pkthdr.len - offset), htonl(nxt)); if (nxt == IPPROTO_TCP) offset += offsetof(struct tcphdr, th_sum); else if (nxt == IPPROTO_UDP) offset += offsetof(struct udphdr, uh_sum); else if (nxt == IPPROTO_ICMPV6) offset += offsetof(struct icmp6_hdr, icmp6_cksum); if ((offset + sizeof(u_int16_t)) > m->m_len) m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); else *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum; } if (m->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) { if (!ifp || !(ifp->if_capabilities & IFCAP_CSUM_TCPv6) || ip6->ip6_nxt != IPPROTO_TCP || ifp->if_bridgeidx != 0) { tcpstat_inc(tcps_outswcsum); in6_delayed_cksum(m, IPPROTO_TCP); m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_OUT; /* Clear */ } } else if (m->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) { if (!ifp || !(ifp->if_capabilities & IFCAP_CSUM_UDPv6) || ip6->ip6_nxt != IPPROTO_UDP || ifp->if_bridgeidx != 0) { udpstat_inc(udps_outswcsum); in6_delayed_cksum(m, IPPROTO_UDP); m->m_pkthdr.csum_flags &= ~M_UDP_CSUM_OUT; /* Clear */ } } else if (m->m_pkthdr.csum_flags & M_ICMP_CSUM_OUT) { in6_delayed_cksum(m, IPPROTO_ICMPV6); m->m_pkthdr.csum_flags &= ~M_ICMP_CSUM_OUT; /* Clear */ } } #ifdef IPSEC struct tdb * ip6_output_ipsec_lookup(struct mbuf *m, int *error, struct inpcb *inp) { struct tdb *tdb; struct m_tag *mtag; struct tdb_ident *tdbi; /* * Check if there was an outgoing SA bound to the flow * from a transport protocol. */ /* Do we have any pending SAs to apply ? */ tdb = ipsp_spd_lookup(m, AF_INET6, sizeof(struct ip6_hdr), error, IPSP_DIRECTION_OUT, NULL, inp, 0); if (tdb == NULL) return NULL; /* Loop detection */ for (mtag = m_tag_first(m); mtag != NULL; mtag = m_tag_next(m, mtag)) { if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE) continue; tdbi = (struct tdb_ident *)(mtag + 1); if (tdbi->spi == tdb->tdb_spi && tdbi->proto == tdb->tdb_sproto && tdbi->rdomain == tdb->tdb_rdomain && !memcmp(&tdbi->dst, &tdb->tdb_dst, sizeof(union sockaddr_union))) { /* no IPsec needed */ return NULL; } } return tdb; } int ip6_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route_in6 *ro, int tunalready, int fwd) { #if NPF > 0 struct ifnet *encif; #endif struct ip6_hdr *ip6; int error; #if NPF > 0 /* * Packet filter */ if ((encif = enc_getif(tdb->tdb_rdomain, tdb->tdb_tap)) == NULL || pf_test(AF_INET6, fwd ? PF_FWD : PF_OUT, encif, &m) != PF_PASS) { m_freem(m); return EACCES; } if (m == NULL) return 0; /* * PF_TAG_REROUTE handling or not... * Packet is entering IPsec so the routing is * already overruled by the IPsec policy. * Until now the change was not reconsidered. * What's the behaviour? 
*/ in6_proto_cksum_out(m, encif); #endif /* Check if we are allowed to fragment */ ip6 = mtod(m, struct ip6_hdr *); if (ip_mtudisc && tdb->tdb_mtu && sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) { struct rtentry *rt = NULL; int rt_mtucloned = 0; int transportmode = 0; transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET6) && (IN6_ARE_ADDR_EQUAL(&tdb->tdb_dst.sin6.sin6_addr, &ip6->ip6_dst)); /* Find a host route to store the mtu in */ if (ro != NULL) rt = ro->ro_rt; /* but don't add a PMTU route for transport mode SAs */ if (transportmode) rt = NULL; else if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0) { struct sockaddr_in6 sin6; memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = ip6->ip6_dst; sin6.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.ph_ifidx, &ip6->ip6_dst); error = in6_embedscope(&ip6->ip6_dst, &sin6, NULL); if (error) { /* should be impossible */ ipsecstat_inc(ipsec_odrops); m_freem(m); return error; } rt = icmp6_mtudisc_clone(&sin6, m->m_pkthdr.ph_rtableid, 1); rt_mtucloned = 1; } DPRINTF(("%s: spi %08x mtu %d rt %p cloned %d\n", __func__, ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned)); if (rt != NULL) { rt->rt_mtu = tdb->tdb_mtu; if (ro != NULL && ro->ro_rt != NULL) { rtfree(ro->ro_rt); ro->ro_rt = rtalloc(sin6tosa(&ro->ro_dst), RT_RESOLVE, m->m_pkthdr.ph_rtableid); } if (rt_mtucloned) rtfree(rt); } ipsec_adjust_mtu(m, tdb->tdb_mtu); m_freem(m); return EMSGSIZE; } /* propagate don't fragment for v6-over-v6 */ if (ip_mtudisc) SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); /* * Clear these -- they'll be set in the recursive invocation * as needed. */ m->m_flags &= ~(M_BCAST | M_MCAST); /* Callee frees mbuf */ error = ipsp_process_packet(m, tdb, AF_INET6, tunalready); if (error) { ipsecstat_inc(ipsec_odrops); tdb->tdb_odrops++; } return error; } #endif /* IPSEC */
/* $OpenBSD: kern_task.c,v 1.31 2020/08/01 08:40:20 anton Exp $ */ /* * Copyright (c) 2013 David Gwynne <dlg@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/mutex.h> #include <sys/kthread.h> #include <sys/task.h> #include <sys/proc.h> #include <sys/witness.h> #include "kcov.h" #if NKCOV > 0 #include <sys/kcov.h> #endif #ifdef WITNESS static struct lock_type taskq_lock_type = { .lt_name = "taskq" }; #define TASKQ_LOCK_FLAGS LO_WITNESS | LO_INITIALIZED | LO_SLEEPABLE | \ (LO_CLASS_RWLOCK << LO_CLASSSHIFT) #endif /* WITNESS */ struct taskq_thread { SLIST_ENTRY(taskq_thread) tt_entry; struct proc *tt_thread; }; SLIST_HEAD(taskq_threads, taskq_thread); struct taskq { enum { TQ_S_CREATED, TQ_S_RUNNING, TQ_S_DESTROYED } tq_state; unsigned int tq_running; unsigned int tq_nthreads; unsigned int tq_flags; const char *tq_name; struct mutex tq_mtx; struct task_list tq_worklist; struct taskq_threads tq_threads; unsigned int tq_barriers; unsigned int tq_bgen; unsigned int tq_bthreads; #ifdef WITNESS struct lock_object tq_lock_object; #endif }; static const char taskq_sys_name[] = "systq"; struct taskq taskq_sys = { .tq_state = TQ_S_CREATED, .tq_running = 0, .tq_nthreads = 1, .tq_flags = 0, .tq_name = taskq_sys_name, .tq_mtx = MUTEX_INITIALIZER_FLAGS(IPL_HIGH, taskq_sys_name, 0), .tq_worklist = TAILQ_HEAD_INITIALIZER(taskq_sys.tq_worklist), .tq_threads = SLIST_HEAD_INITIALIZER(taskq_sys.tq_threads), .tq_barriers = 0, .tq_bgen = 0, .tq_bthreads = 0, #ifdef WITNESS .tq_lock_object = { .lo_name = taskq_sys_name, .lo_flags = TASKQ_LOCK_FLAGS, }, #endif }; static const char taskq_sys_mp_name[] = "systqmp"; struct taskq taskq_sys_mp = { .tq_state = TQ_S_CREATED, .tq_running = 0, .tq_nthreads = 1, .tq_flags = TASKQ_MPSAFE, .tq_name = taskq_sys_mp_name, .tq_mtx = MUTEX_INITIALIZER_FLAGS(IPL_HIGH, taskq_sys_mp_name, 0), .tq_worklist = TAILQ_HEAD_INITIALIZER(taskq_sys_mp.tq_worklist), .tq_threads = SLIST_HEAD_INITIALIZER(taskq_sys_mp.tq_threads), .tq_barriers = 0, .tq_bgen = 0, .tq_bthreads = 0, #ifdef WITNESS .tq_lock_object = { .lo_name = taskq_sys_mp_name, .lo_flags = TASKQ_LOCK_FLAGS, }, #endif }; struct taskq *const systq = &taskq_sys; struct taskq *const systqmp = &taskq_sys_mp; void taskq_init(void); /* called in init_main.c */ void taskq_create_thread(void *); void taskq_barrier_task(void *); int taskq_sleep(const volatile void *, struct mutex *, int, const char *, int); int taskq_next_work(struct taskq *, struct task *); void taskq_thread(void *); void taskq_init(void) { WITNESS_INIT(&systq->tq_lock_object, &taskq_lock_type); kthread_create_deferred(taskq_create_thread, systq); WITNESS_INIT(&systqmp->tq_lock_object, &taskq_lock_type); kthread_create_deferred(taskq_create_thread, systqmp); } struct taskq * taskq_create(const char
struct taskq *
taskq_create(const char *name, unsigned int nthreads, int ipl,
    unsigned int flags)
{
	struct taskq *tq;

	tq = malloc(sizeof(*tq), M_DEVBUF, M_WAITOK);
	if (tq == NULL)
		return (NULL);

	tq->tq_state = TQ_S_CREATED;
	tq->tq_running = 0;
	tq->tq_nthreads = nthreads;
	tq->tq_name = name;
	tq->tq_flags = flags;

	mtx_init_flags(&tq->tq_mtx, ipl, name, 0);
	TAILQ_INIT(&tq->tq_worklist);

	SLIST_INIT(&tq->tq_threads);
	tq->tq_barriers = 0;
	tq->tq_bgen = 0;
	tq->tq_bthreads = 0;

#ifdef WITNESS
	memset(&tq->tq_lock_object, 0, sizeof(tq->tq_lock_object));
	tq->tq_lock_object.lo_name = name;
	tq->tq_lock_object.lo_flags = TASKQ_LOCK_FLAGS;
	witness_init(&tq->tq_lock_object, &taskq_lock_type);
#endif

	/* try to create a thread to guarantee that tasks will be serviced */
	kthread_create_deferred(taskq_create_thread, tq);

	return (tq);
}

void
taskq_destroy(struct taskq *tq)
{
	mtx_enter(&tq->tq_mtx);
	switch (tq->tq_state) {
	case TQ_S_CREATED:
		/* tq is still referenced by taskq_create_thread */
		tq->tq_state = TQ_S_DESTROYED;
		mtx_leave(&tq->tq_mtx);
		return;

	case TQ_S_RUNNING:
		tq->tq_state = TQ_S_DESTROYED;
		break;

	default:
		panic("unexpected %s tq state %u", tq->tq_name,
		    tq->tq_state);
	}

	while (tq->tq_running > 0) {
		wakeup(tq);
		msleep_nsec(&tq->tq_running, &tq->tq_mtx, PWAIT, "tqdestroy",
		    INFSLP);
	}
	mtx_leave(&tq->tq_mtx);

	free(tq, M_DEVBUF, sizeof(*tq));
}

void
taskq_create_thread(void *arg)
{
	struct taskq *tq = arg;
	int rv;

	mtx_enter(&tq->tq_mtx);

	switch (tq->tq_state) {
	case TQ_S_DESTROYED:
		mtx_leave(&tq->tq_mtx);
		free(tq, M_DEVBUF, sizeof(*tq));
		return;

	case TQ_S_CREATED:
		tq->tq_state = TQ_S_RUNNING;
		break;

	default:
		panic("unexpected %s tq state %d", tq->tq_name, tq->tq_state);
	}

	do {
		tq->tq_running++;
		mtx_leave(&tq->tq_mtx);

		rv = kthread_create(taskq_thread, tq, NULL, tq->tq_name);

		mtx_enter(&tq->tq_mtx);
		if (rv != 0) {
			printf("unable to create thread for \"%s\" taskq\n",
			    tq->tq_name);

			tq->tq_running--;
			/* could have been destroyed during kthread_create */
			if (tq->tq_state == TQ_S_DESTROYED &&
			    tq->tq_running == 0)
				wakeup_one(&tq->tq_running);
			break;
		}
	} while (tq->tq_running < tq->tq_nthreads);

	mtx_leave(&tq->tq_mtx);
}

void
taskq_barrier_task(void *p)
{
	struct taskq *tq = p;
	unsigned int gen;

	mtx_enter(&tq->tq_mtx);
	tq->tq_bthreads++;
	wakeup(&tq->tq_bthreads);

	gen = tq->tq_bgen;
	do {
		msleep_nsec(&tq->tq_bgen, &tq->tq_mtx,
		    PWAIT, "tqbarend", INFSLP);
	} while (gen == tq->tq_bgen);
	mtx_leave(&tq->tq_mtx);
}
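/*
 * Barrier usage sketch (illustrative only, not part of the original
 * file): a detach path that must not return while its work handler
 * could still be running.  taskq_del_barrier() removes the task if it
 * is still pending; otherwise it waits for every worker thread to
 * pass through a barrier, which guarantees the handler has finished.
 * my_detach and the sc_* fields follow the hypothetical example above.
 *
 *	void
 *	my_detach(struct my_softc *sc)
 *	{
 *		taskq_del_barrier(sc->sc_tq, &sc->sc_work);
 *		/-* sc_work is now neither queued nor running *-/
 *		taskq_destroy(sc->sc_tq);
 *	}
 */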
static void
taskq_do_barrier(struct taskq *tq)
{
	struct task t = TASK_INITIALIZER(taskq_barrier_task, tq);
	struct proc *thread = curproc;
	struct taskq_thread *tt;

	mtx_enter(&tq->tq_mtx);
	tq->tq_barriers++;

	/* is the barrier being run from a task inside the taskq? */
	SLIST_FOREACH(tt, &tq->tq_threads, tt_entry) {
		if (tt->tt_thread == thread) {
			tq->tq_bthreads++;
			wakeup(&tq->tq_bthreads);
			break;
		}
	}

	while (tq->tq_bthreads < tq->tq_nthreads) {
		/* shove the task into the queue for a worker to pick up */
		SET(t.t_flags, TASK_ONQUEUE);
		TAILQ_INSERT_TAIL(&tq->tq_worklist, &t, t_entry);
		wakeup_one(tq);

		msleep_nsec(&tq->tq_bthreads, &tq->tq_mtx,
		    PWAIT, "tqbar", INFSLP);

		/*
		 * another thread running a barrier might have
		 * done this work for us.
		 */
		if (ISSET(t.t_flags, TASK_ONQUEUE))
			TAILQ_REMOVE(&tq->tq_worklist, &t, t_entry);
	}

	if (--tq->tq_barriers == 0) {
		/* we're the last one out */
		tq->tq_bgen++;
		wakeup(&tq->tq_bgen);
		tq->tq_bthreads = 0;
	} else {
		unsigned int gen = tq->tq_bgen;
		do {
			msleep_nsec(&tq->tq_bgen, &tq->tq_mtx,
			    PWAIT, "tqbarwait", INFSLP);
		} while (gen == tq->tq_bgen);
	}

	mtx_leave(&tq->tq_mtx);
}

void
taskq_barrier(struct taskq *tq)
{
	WITNESS_CHECKORDER(&tq->tq_lock_object, LOP_NEWORDER, NULL);

	taskq_do_barrier(tq);
}

void
taskq_del_barrier(struct taskq *tq, struct task *t)
{
	WITNESS_CHECKORDER(&tq->tq_lock_object, LOP_NEWORDER, NULL);

	if (task_del(tq, t))
		return;

	taskq_do_barrier(tq);
}

void
task_set(struct task *t, void (*fn)(void *), void *arg)
{
	t->t_func = fn;
	t->t_arg = arg;
	t->t_flags = 0;
}
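/*
 * Note (illustrative): task_set() is the runtime counterpart of the
 * TASK_INITIALIZER() macro used by taskq_do_barrier() above, e.g.
 *
 *	struct task t = TASK_INITIALIZER(my_fn, my_arg);
 *
 * Both leave t_flags clear of TASK_ONQUEUE, so a subsequent
 * task_add() will enqueue the task.
 */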